[clang] [Clang] Define AMDGPU ABI when referenced in CodeGen for ABI "none" (PR #66162)

Tue Sep 12 19:10:40 PDT 2023

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/66162:

>From afe74583e6f9a5bd785427204866367f3eac8620 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6 at vols.utk.edu>
Date: Tue, 12 Sep 2023 19:22:30 -0500
Subject: [PATCH] [Clang] Define AMDGPU ABI when referenced in CodeGen for ABI
 "none"

Summary:
We use the 1llvm.amgcn.abi.version` varaible to control code generation.
This is emitted in every module now to indicate what should be used when
compiling. Previously, the logic caused us to emit an external reference
to this variable when creating the code for the `none` type. This would
then cause us not to emit the actual definition. This patch refines the
logic to create the external reference, and then update it if it is
found unset by the time we emit the global. I had to remove the
reference to `GetOrCreateLLVmGlobal` because it did not accept the
proper address space.
---
 clang/lib/CodeGen/CGBuiltin.cpp               | 11 +++++--
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 10 +++++-
 clang/test/CodeGen/amdgpu-abi-version.c       | 32 +++++++++++++++++++
 .../test/CodeGenCUDA/amdgpu-workgroup-size.cu |  6 ++--
 4 files changed, 52 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/CodeGen/amdgpu-abi-version.c

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8b19bf85d47a19f..fcda6e67934330a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17188,9 +17188,14 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
   auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
 
   if (Cov == clang::TargetOptions::COV_None) {
-    auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
-        "llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
-        CodeGen::NotForDefinition);
+    StringRef Name = "llvm.amdgcn.abi.version";
+    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
+    if (!ABIVersionC)
+      ABIVersionC = new llvm::GlobalVariable(
+          CGF.CGM.getModule(), CGF.Int32Ty, false,
+          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
+          llvm::GlobalVariable::NotThreadLocal,
+          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
 
     // This load will be eliminated by the IPSCCP because it is constant
     // weak_odr without externally_initialized. Either changing it to weak or
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index c168bd4b7c7cc15..7ed720f8dd2abb1 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -362,7 +362,8 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
 void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
     CodeGen::CodeGenModule &CGM) const {
   StringRef Name = "llvm.amdgcn.abi.version";
-  if (CGM.getModule().getNamedGlobal(Name))
+  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
+  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
     return;
 
   auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
@@ -377,6 +378,13 @@ void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
       CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
   GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
   GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+
+  // Replace any external references to this variable with the new global.
+  if (OriginalGV) {
+    OriginalGV->replaceAllUsesWith(GV);
+    GV->takeName(OriginalGV);
+    OriginalGV->eraseFromParent();
+  }
 }
 
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/test/CodeGen/amdgpu-abi-version.c b/clang/test/CodeGen/amdgpu-abi-version.c
new file mode 100644
index 000000000000000..1cec45fb213057c
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-abi-version.c
@@ -0,0 +1,32 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
+// RUN: %clang_cc1 -cc1 -triple amdgcn-amd-amdhsa -emit-llvm -mcode-object-version=none %s -o - | FileCheck %s
+
+//.
+// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 0
+//.
+// CHECK-LABEL: define dso_local i32 @foo(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) @llvm.amdgcn.abi.version, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = icmp sge i32 [[TMP0]], 500
+// CHECK-NEXT:    [[TMP2:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i32 12
+// CHECK-NEXT:    [[TMP4:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP4]], i32 4
+// CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP1]], ptr addrspace(4) [[TMP3]], ptr addrspace(4) [[TMP5]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 2, !range [[RNG2:![0-9]+]], !invariant.load !3, !noundef !3
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP7]] to i32
+// CHECK-NEXT:    ret i32 [[CONV]]
+//
+int foo() { return __builtin_amdgcn_workgroup_size_x(); }
+//.
+// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+//.
+// CHECK: !0 = !{i32 1, !"wchar_size", i32 4}
+// CHECK: !1 = !{!"clang version 18.0.0"}
+// CHECK: !2 = !{i16 1, i16 1025}
+// CHECK: !3 = !{}
+//.
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index c661b06d57b78d7..f35c06eaff6982b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -33,7 +33,7 @@
 
 
 // COVNONE-LABEL: test_get_workgroup_size
-// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
 // COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 // COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -42,7 +42,7 @@
 // COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
 // COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 
-// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
 // COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 // COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -51,7 +51,7 @@
 // COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
 // COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 
-// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
 // COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 // COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16