[clang] 369e26c - [AMDGPU] Add __builtin_amdgcn_workgroup_size_x/y/z
Yaxun Liu via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 27 22:11:56 PDT 2020
Author: Yaxun (Sam) Liu
Date: 2020-03-28T01:03:20-04:00
New Revision: 369e26ca9e0d9ceb87c70d26e9f13e793ee1ab40
URL: https://github.com/llvm/llvm-project/commit/369e26ca9e0d9ceb87c70d26e9f13e793ee1ab40
DIFF: https://github.com/llvm/llvm-project/commit/369e26ca9e0d9ceb87c70d26e9f13e793ee1ab40.diff
LOG: [AMDGPU] Add __builtin_amdgcn_workgroup_size_x/y/z
The main purpose of introducing these builtins is to add a range
metadata [1, 1025) on the work group size loaded from dispatch
ptr, which cannot be done by source code.
Differential Revision: https://reviews.llvm.org/D76772
Added:
clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/include/clang/Basic/TargetInfo.h
clang/lib/Basic/TargetInfo.cpp
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index a9143ad8292c..e5b256c07a49 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -33,6 +33,10 @@ BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")
BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")
BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workgroup_size_x, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workgroup_size_y, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_workgroup_size_z, "Ui", "nc")
+
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 81760ec82838..5edfa0e4e0c7 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -212,6 +212,8 @@ class TargetInfo : public virtual TransferrableTargetInfo,
unsigned ARMCDECoprocMask : 8;
+ unsigned MaxOpenCLWorkGroupSize;
+
// TargetInfo Constructor. Default initializes all fields.
TargetInfo(const llvm::Triple &T);
@@ -663,6 +665,8 @@ class TargetInfo : public virtual TransferrableTargetInfo,
/// types for the given target.
unsigned getSimdDefaultAlign() const { return SimdDefaultAlign; }
+ unsigned getMaxOpenCLWorkGroupSize() const { return MaxOpenCLWorkGroupSize; }
+
/// Return the alignment (in bits) of the thrown exception object. This is
/// only meaningful for targets that allocate C++ exceptions in a system
/// runtime, such as those using the Itanium C++ ABI.
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 2330339bedfb..2f1e044bb106 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -133,6 +133,8 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : TargetOpts(), Triple(T) {
// Default to an unknown platform name.
PlatformName = "unknown";
PlatformMinVersion = VersionTuple();
+
+ MaxOpenCLWorkGroupSize = 1024;
}
// Out of line virtual dtor for TargetInfo.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 754d95d1ab81..880fe0e271f5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13407,6 +13407,48 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
}
}
+namespace {
+// If \p E is not null pointer, insert address space cast to match return
+// type of \p E if necessary.
+Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
+ const CallExpr *E = nullptr) {
+ auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
+ auto *Call = CGF.Builder.CreateCall(F);
+ Call->addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+ Call->addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Call->getContext(), Align(4)));
+ if (!E)
+ return Call;
+ QualType BuiltinRetType = E->getType();
+ auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
+ if (RetTy == Call->getType())
+ return Call;
+ return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
+}
+
+// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
+ const unsigned XOffset = 4;
+ auto *DP = EmitAMDGPUDispatchPtr(CGF);
+ // Indexing the HSA kernel_dispatch_packet struct.
+ auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
+ auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
+ auto *DstTy =
+ CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
+ auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
+ auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(2)));
+ llvm::MDBuilder MDHelper(CGF.getLLVMContext());
+ llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
+ APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
+ LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
+ LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+ llvm::MDNode::get(CGF.getLLVMContext(), None));
+ return LD;
+}
+} // namespace
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
switch (BuiltinID) {
@@ -13489,21 +13531,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_cosf:
case AMDGPU::BI__builtin_amdgcn_cosh:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
- case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: {
- auto *F = CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
- auto *Call = Builder.CreateCall(F);
- Call->addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
- Call->addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Call->getContext(), Align(4)));
- QualType BuiltinRetType = E->getType();
- auto *RetTy = cast<llvm::PointerType>(ConvertType(BuiltinRetType));
- if (RetTy == Call->getType())
- return Call;
- return Builder.CreateAddrSpaceCast(Call, RetTy);
- }
+ case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
+ return EmitAMDGPUDispatchPtr(*this, E);
case AMDGPU::BI__builtin_amdgcn_log_clampf:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
case AMDGPU::BI__builtin_amdgcn_ldexp:
@@ -13599,6 +13628,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
+ // amdgcn workgroup size
+ case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
+ return EmitAMDGPUWorkGroupSize(*this, 0);
+ case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
+ return EmitAMDGPUWorkGroupSize(*this, 1);
+ case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
+ return EmitAMDGPUWorkGroupSize(*this, 2);
+
// r600 intrinsics
case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
new file mode 100644
index 000000000000..5928320b89f0
--- /dev/null
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: test_get_workgroup_size
+// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+__device__ void test_get_workgroup_size(int d, int *out)
+{
+ switch (d) {
+ case 0: *out = __builtin_amdgcn_workgroup_size_x(); break;
+ case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
+ case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
+ default: *out = 0;
+ }
+}
+
+// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 0aa3e4144c52..9d7916c236c5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -527,6 +527,24 @@ void test_get_local_id(int d, global int *out)
}
}
+// CHECK-LABEL: @test_get_workgroup_size(
+// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 4
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 6
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 8
+// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+void test_get_workgroup_size(int d, global int *out)
+{
+ switch (d) {
+ case 0: *out = __builtin_amdgcn_workgroup_size_x(); break;
+ case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
+ case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
+ default: *out = 0;
+ }
+}
+
// CHECK-LABEL: @test_fmed3_f32
// CHECK: call float @llvm.amdgcn.fmed3.f32(
void test_fmed3_f32(global float* out, float a, float b, float c)
@@ -698,6 +716,7 @@ kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 sr
}
// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
+// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
// CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }
// CHECK-DAG: ![[$EXEC]] = !{!"exec"}
More information about the cfe-commits
mailing list