[clang] 23431b5 - [clang][CodeGen] Fix GPU-specific attributes being dropped by bitcode linking

via cfe-commits cfe-commits at lists.llvm.org
Wed Jun 7 06:51:58 PDT 2023


Author: pvanhout
Date: 2023-06-07T15:51:52+02:00
New Revision: 23431b52460328c554ad244fd7b50ecb751cec31

URL: https://github.com/llvm/llvm-project/commit/23431b52460328c554ad244fd7b50ecb751cec31
DIFF: https://github.com/llvm/llvm-project/commit/23431b52460328c554ad244fd7b50ecb751cec31.diff

LOG: [clang][CodeGen] Fix GPU-specific attributes being dropped by bitcode linking

Device libs make use of patterns like this:
```
__attribute__((target("gfx11-insts")))
static unsigned do_intrin_stuff(void)
{
  return __builtin_amdgcn_s_sendmsg_rtnl(0x0);
}
```
For functions that are assumed to be eliminated if the currennt GPU target doesn't support them.
At O0 such functions aren't eliminated by common optimizations but often by AMDGPURemoveIncompatibleFunctions instead, which sees the "+gfx11-insts" attribute on, say, GFX9 and knows it's not valid, so it removes the function.

D142907 accidentally made it so such attributes were dropped during bitcode linking, making it impossible for RemoveIncompatibleFunctions to catch the functions and causing ISel to catch fire eventually.

This fixes the issue and adds a new test to ensure we don't accidentally fall into this trap again.

Fixes SWDEV-403642

Reviewed By: arsenm, yaxunl

Differential Revision: https://reviews.llvm.org/D152251

Added: 
    clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl
    clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu

Modified: 
    clang/lib/CodeGen/CGCall.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/CodeGen/CodeGenModule.h
    clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index b6b04fca66820..eb45e82fe8256 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2025,7 +2025,8 @@ void CodeGenModule::mergeDefaultFunctionDefinitionAttributes(
   llvm::AttrBuilder FuncAttrs(F.getContext());
   getTrivialDefaultFunctionAttributes(F.getName(), F.hasOptNone(),
                                       /*AttrOnCallSite=*/false, FuncAttrs);
-  GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs);
+  GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs,
+                              /*AddTargetFeatures=*/false);
 
   if (!WillInternalize && F.isInterposable()) {
     // Do not promote "dynamic" denormal-fp-math to this translation unit's

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 49b670487749a..41ceebfba404a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2226,7 +2226,8 @@ void CodeGenModule::SetCommonAttributes(GlobalDecl GD, llvm::GlobalValue *GV) {
 }
 
 bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
-                                                llvm::AttrBuilder &Attrs) {
+                                                llvm::AttrBuilder &Attrs,
+                                                bool SetTargetFeatures) {
   // Add target-cpu and target-features attributes to functions. If
   // we have a decl for the function and it has a target attribute then
   // parse that and add it to the feature set.
@@ -2286,7 +2287,7 @@ bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
     Attrs.addAttribute("tune-cpu", TuneCPU);
     AddedAttr = true;
   }
-  if (!Features.empty()) {
+  if (!Features.empty() && SetTargetFeatures) {
     llvm::sort(Features);
     Attrs.addAttribute("target-features", llvm::join(Features, ","));
     AddedAttr = true;

diff  --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index e4ef209195275..43d61b40f76b4 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1583,7 +1583,8 @@ class CodeGenModule : public CodeGenTypeCache {
                         ForDefinition_t IsForDefinition = NotForDefinition);
 
   bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
-                                   llvm::AttrBuilder &AttrBuilder);
+                                   llvm::AttrBuilder &AttrBuilder,
+                                   bool SetTargetFeatures = true);
   void setNonAliasAttributes(GlobalDecl GD, llvm::GlobalObject *GO);
 
   /// Set function attributes for a function declaration.

diff  --git a/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl b/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl
new file mode 100644
index 0000000000000..b26595da4e9af
--- /dev/null
+++ b/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl
@@ -0,0 +1,7 @@
+typedef unsigned long ulong;
+
+__attribute__((target("gfx11-insts")))
+ulong do_intrin_stuff(void)
+{
+  return __builtin_amdgcn_s_sendmsg_rtnl(0x0);
+}

diff  --git a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
index 910d2470315f1..f12eacfb53fab 100644
--- a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
+++ b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
@@ -132,24 +132,32 @@ __global__ void kernel_f64(double* out, double* a, double* b, double* c) {
 
 // Default mode relies on the implicit check-not for the denormal-fp-math.
 
-// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
-// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
-// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
+// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
+// PSZ-SAME: "target-cpu"="gfx803"
+// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
+// PSZ-SAME: "target-cpu"="gfx803"
+// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
+// PSZ-SAME: "target-cpu"="gfx803"
 
 // FIXME: Should check-not "denormal-fp-math" within the line
-// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
-// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
-// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
+// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
+// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
+// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
+// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
+// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
+// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
 
 // IEEEF32-PSZF64-DYNF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}}  }
 // implicit check-not
 // implicit check-not
 
 
-// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
-// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}}  }
-// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"  {{.*}} "target-cpu"="gfx803" {{.*}} }
-
+// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
+// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
+// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
+// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
+// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
+// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
 
 // -mlink-bitcode-file doesn't internalize or propagate attributes.
 // NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }

diff  --git a/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu b/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu
new file mode 100644
index 0000000000000..a2c8faec47586
--- /dev/null
+++ b/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu
@@ -0,0 +1,48 @@
+// Verify the behavior of the +gfxN-insts in the way that
+// rocm-device-libs should be built with. e.g. If the device libraries has a function
+// with "+gfx11-insts", that attribute should still be present after linking and not
+// overwritten with the current target's settings.
+
+// This is important because at this time, many device-libs functions that are only
+// available on some GPUs put an attribute such as "+gfx11-insts" so that
+// AMDGPURemoveIncompatibleFunctions can detect & remove them if needed.
+
+// Build the fake device library in the way rocm-device-libs should be built.
+//
+// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa\
+// RUN:   -mcode-object-version=none -emit-llvm-bc \
+// RUN:   %S/Inputs/ocml-sample-target-attrs.cl -o %t.bc
+
+// Check the default behavior
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
+// RUN:   -mlink-builtin-bitcode %t.bc \
+// RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE
+
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 -fcuda-is-device \
+// RUN:   -mlink-builtin-bitcode %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE
+
+// Check the case where no internalization is performed
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
+// RUN:   -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o -  | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE
+
+// Check the case where no internalization is performed
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 \
+// RUN:   -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE
+
+
+// CHECK: define {{.*}} i64 @do_intrin_stuff() #[[ATTR:[0-9]+]]
+// INTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-cpu"="gfx{{.*}}" "target-features"="+gfx11-insts"
+// NOINTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-features"="+gfx11-insts"
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+typedef unsigned long ulong;
+
+extern "C" {
+__device__ ulong do_intrin_stuff(void);
+
+__global__ void kernel_f16(ulong* out) {
+    *out = do_intrin_stuff();
+  }
+}


        


More information about the cfe-commits mailing list