[Openmp-commits] [openmp] dee7704 - [AMDGPU] Add __builtin_amdgcn_grid_size
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Thu Oct 29 09:25:32 PDT 2020
Author: Jon Chesterfield
Date: 2020-10-29T16:25:13Z
New Revision: dee7704829bd421ad3cce4b2132d28f4459b7319
URL: https://github.com/llvm/llvm-project/commit/dee7704829bd421ad3cce4b2132d28f4459b7319
DIFF: https://github.com/llvm/llvm-project/commit/dee7704829bd421ad3cce4b2132d28f4459b7319.diff
LOG: [AMDGPU] Add __builtin_amdgcn_grid_size
[AMDGPU] Add __builtin_amdgcn_grid_size
Similar to D76772, loads the data from the dispatch pointer. Marked invariant.
Patch also updates the openmp devicertl to use this builtin.
Reviewed By: yaxunl
Differential Revision: https://reviews.llvm.org/D90251
Added:
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 042a86368559..f5901e6f8f3b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -37,6 +37,10 @@ BUILTIN(__builtin_amdgcn_workgroup_size_x, "Us", "nc")
BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc")
BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc")
+BUILTIN(__builtin_amdgcn_grid_size_x, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_grid_size_y, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
+
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6f7505b7b5c2..f933113fa883 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14750,6 +14750,22 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
llvm::MDNode::get(CGF.getLLVMContext(), None));
return LD;
}
+
+// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
+ const unsigned XOffset = 12;
+ auto *DP = EmitAMDGPUDispatchPtr(CGF);
+ // Indexing the HSA kernel_dispatch_packet struct.
+ auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
+ auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
+ auto *DstTy =
+ CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
+ auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
+ auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(4)));
+ LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+ llvm::MDNode::get(CGF.getLLVMContext(), None));
+ return LD;
+}
} // namespace
// For processing memory ordering and memory scope arguments of various
@@ -15010,6 +15026,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
return EmitAMDGPUWorkGroupSize(*this, 2);
+ // amdgcn grid size
+ case AMDGPU::BI__builtin_amdgcn_grid_size_x:
+ return EmitAMDGPUGridSize(*this, 0);
+ case AMDGPU::BI__builtin_amdgcn_grid_size_y:
+ return EmitAMDGPUGridSize(*this, 1);
+ case AMDGPU::BI__builtin_amdgcn_grid_size_z:
+ return EmitAMDGPUGridSize(*this, 2);
+
// r600 intrinsics
case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 56c83df6b6b4..20edaf2aae3f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -559,6 +559,24 @@ void test_get_workgroup_size(int d, global int *out)
}
}
+// CHECK-LABEL: @test_get_grid_size(
+// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 20
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+void test_get_grid_size(int d, global int *out)
+{
+ switch (d) {
+ case 0: *out = __builtin_amdgcn_grid_size_x(); break;
+ case 1: *out = __builtin_amdgcn_grid_size_y(); break;
+ case 2: *out = __builtin_amdgcn_grid_size_z(); break;
+ default: *out = 0;
+ }
+}
+
// CHECK-LABEL: @test_fmed3_f32
// CHECK: call float @llvm.amdgcn.fmed3.f32(
void test_fmed3_f32(global float* out, float a, float b, float c)
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 8c53d99b9fb6..9fbdc67b56ab 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -119,12 +119,6 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
}
namespace {
-DEVICE uint32_t grid_size_x() {
- size_t grid_size_x_offset = 96; // In bits, from AQL kernel dispatch format
- return *(uint32_t *)((char *)__builtin_amdgcn_dispatch_ptr() +
- grid_size_x_offset / 8);
-}
-
DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
uint32_t q = n / d;
return q + (n > q * d);
@@ -137,11 +131,11 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
} // namespace
DEVICE int GetNumberOfBlocksInKernel() {
- return get_grid_dim(grid_size_x(), __builtin_amdgcn_workgroup_size_x());
+ return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
}
DEVICE int GetNumberOfThreadsInBlock() {
- return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), grid_size_x(),
+ return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
}
More information about the Openmp-commits
mailing list