[Openmp-commits] [openmp] 2186199 - [OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL (#71234)
via Openmp-commits
openmp-commits at lists.llvm.org
Wed Nov 8 21:04:40 PST 2023
Author: Saiyedul Islam
Date: 2023-11-09T10:34:35+05:30
New Revision: 21861991e760e7e845dc1be5b804c950543d699a
URL: https://github.com/llvm/llvm-project/commit/21861991e760e7e845dc1be5b804c950543d699a
DIFF: https://github.com/llvm/llvm-project/commit/21861991e760e7e845dc1be5b804c950543d699a.diff
LOG: [OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL (#71234)
Fixes the DeviceRTL compilation to ensure it is ABI agnostic. Uses
already available global variable "oclc_ABI_version" instead of
"llvm.amdgcn.abi.verion".
It also adds some minor fields in ImplicitArg structure.
Added:
Modified:
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/CodeGen/Targets/AMDGPU.cpp
clang/test/CodeGen/amdgpu-abi-version.c
clang/test/CodeGen/amdgpu-address-spaces.cpp
clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e7e498e8a933131..d49c44dbaace3a8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17432,11 +17432,11 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
/// Emit code based on Code Object ABI version.
/// COV_4 : Emit code to use dispatch ptr
/// COV_5 : Emit code to use implicitarg ptr
-/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version"
+/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
/// and use its value for COV_4 or COV_5 approach. It is used for
/// compiling device libraries in an ABI-agnostic way.
///
-/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and intialized by
+/// Note: "__oclc_ABI_version" is supposed to be emitted and intialized by
/// clang during compilation of user code.
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
llvm::LoadInst *LD;
@@ -17444,7 +17444,7 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
if (Cov == clang::TargetOptions::COV_None) {
- StringRef Name = "llvm.amdgcn.abi.version";
+ StringRef Name = "__oclc_ABI_version";
auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
if (!ABIVersionC)
ABIVersionC = new llvm::GlobalVariable(
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0411846cf9b02bd..4dd25213dda9fa5 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -362,11 +362,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
CodeGen::CodeGenModule &CGM) const {
- StringRef Name = "llvm.amdgcn.abi.version";
+ StringRef Name = "__oclc_ABI_version";
llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
return;
+ if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
+ clang::TargetOptions::COV_None)
+ return;
+
auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
llvm::Constant *COV = llvm::ConstantInt::get(
Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
diff --git a/clang/test/CodeGen/amdgpu-abi-version.c b/clang/test/CodeGen/amdgpu-abi-version.c
index d1189545139e2a6..4e5ad87655f2305 100644
--- a/clang/test/CodeGen/amdgpu-abi-version.c
+++ b/clang/test/CodeGen/amdgpu-abi-version.c
@@ -2,14 +2,14 @@
// RUN: %clang_cc1 -cc1 -triple amdgcn-amd-amdhsa -emit-llvm -mcode-object-version=none %s -o - | FileCheck %s
//.
-// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 0
+// CHECK: @__oclc_ABI_version = external addrspace(4) global i32
//.
// CHECK-LABEL: define dso_local i32 @foo(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @llvm.amdgcn.abi.version, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
// CHECK-NEXT: [[TMP1:%.*]] = icmp sge i32 [[TMP0]], 500
// CHECK-NEXT: [[TMP2:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i32 12
diff --git a/clang/test/CodeGen/amdgpu-address-spaces.cpp b/clang/test/CodeGen/amdgpu-address-spaces.cpp
index a9994881eb06228..0a808aa6cc75ed3 100644
--- a/clang/test/CodeGen/amdgpu-address-spaces.cpp
+++ b/clang/test/CodeGen/amdgpu-address-spaces.cpp
@@ -29,7 +29,7 @@ int [[clang::address_space(999)]] bbb = 1234;
// CHECK: @u = addrspace(5) global i32 undef, align 4
// CHECK: @aaa = addrspace(6) global i32 1000, align 4
// CHECK: @bbb = addrspace(999) global i32 1234, align 4
-// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
+// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
//.
// CHECK-LABEL: define dso_local amdgpu_kernel void @foo(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
index cb3bdd2c4eb947d..663687ae227f234 100644
--- a/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
@@ -17,9 +17,9 @@
#include "Inputs/cuda.h"
-// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
+// LINKED4: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
// LINKED4-LABEL: bar
-// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -28,7 +28,7 @@
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -37,7 +37,7 @@
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
@@ -47,9 +47,9 @@
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED4: "amdgpu_code_object_version", i32 400
-// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+// LINKED5: __oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
// LINKED5-LABEL: bar
-// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -58,7 +58,7 @@
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -67,7 +67,7 @@
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index f35c06eaff6982b..282e0a49b9aa10b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -33,7 +33,7 @@
// COVNONE-LABEL: test_get_workgroup_size
-// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
+// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -42,7 +42,7 @@
// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
+// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -51,7 +51,7 @@
// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
-// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
+// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 378cad8f8ca4f15..399a71390a65abe 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3086,6 +3086,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
+ ImplArgs->BlockCountY = 1;
+ ImplArgs->BlockCountZ = 1;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
More information about the Openmp-commits
mailing list