[clang] [Clang][OpenCL][AMDGPU] OpenCL Kernel stubs should be assigned alwaysinline attribute (PR #137769)
Aniket Lal via cfe-commits
cfe-commits at lists.llvm.org
Wed Apr 30 04:20:34 PDT 2025
https://github.com/lalaniket8 updated https://github.com/llvm/llvm-project/pull/137769
From 8fd3b0cafa7ddd260d073232f93e262a6d508d52 Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Tue, 29 Apr 2025 13:59:48 +0530
Subject: [PATCH] add alwaysinline attribute to stubs
---
clang/lib/CodeGen/CodeGenModule.cpp | 13 ++++++++++
.../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 6 ++---
.../test/CodeGenOpenCL/cl-uniform-wg-size.cl | 4 ----
.../CodeGenOpenCL/cl20-device-side-enqueue.cl | 24 +++++++------------
clang/test/CodeGenOpenCL/convergent.cl | 9 +++----
.../enqueue-kernel-non-entry-block.cl | 11 ++-------
6 files changed, 32 insertions(+), 35 deletions(-)
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e917f3c42da06..2daeb6dbc751a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6174,6 +6174,19 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
CodeGenFunction(*this).GenerateCode(GD, Fn, FI);
setNonAliasAttributes(GD, Fn);
+
+ bool ShouldAddOptNone = !CodeGenOpts.DisableO0ImplyOptNone &&
+ (CodeGenOpts.OptimizationLevel == 0) &&
+ !D->hasAttr<MinSizeAttr>();
+
+ if (D->hasAttr<OpenCLKernelAttr>())
+ if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub &&
+ !D->hasAttr<NoInlineAttr>() &&
+ !Fn->hasFnAttribute(llvm::Attribute::NoInline) &&
+ !D->hasAttr<OptimizeNoneAttr>() &&
+ !Fn->hasFnAttribute(llvm::Attribute::OptimizeNone) && !ShouldAddOptNone)
+ Fn->addFnAttr(llvm::Attribute::AlwaysInline);
+
SetLLVMFunctionAttributesForDefinition(D, Fn);
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 773daf53b2746..a0e11a1b5997e 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -492,7 +492,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: ret void
//
//
-// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test(
// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
// GFX900-NEXT: [[ENTRY:.*:]]
@@ -640,7 +640,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: ret void
//
//
-// GFX900: Function Attrs: convergent norecurse nounwind
+// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
@@ -832,7 +832,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
-// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
index 5f32231b18c3d..98587c694619f 100644
--- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
+++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl
@@ -5,7 +5,6 @@
kernel void ker() {};
// CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]]
-// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]]
// CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]]
@@ -18,6 +17,3 @@ void foo() {};
// CHECK: attributes #[[ATTR1]]
// CHECK-NOT: uniform-work-group-size
-
-// CHECK: attributes #[[ATTR2]]
-// CHECK-NOT: uniform-work-group-size
diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index 3355fe1c25819..6c85e734c0eb4 100644
--- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86
-// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLEX86
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86
+// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefix=CHECK-LIFETIMES
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
@@ -39,12 +39,6 @@ void callee(int id, __global int *out) {
out[id] = id;
}
-// TRIPLESPIR: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
-// TRIPLESPIR: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
-
-// TRIPLEX86: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
-// TRIPLEX86: call void @__clang_ocl_kern_imp_device_side_enqueue({{.*}})
-
// COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i)
kernel void device_side_enqueue(global int *a, global int *b, int i) {
// SPIR: %default_queue = alloca target("spirv.Queue")
diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl
index 123adba7b40d2..53a35a4f73119 100644
--- a/clang/test/CodeGenOpenCL/convergent.cl
+++ b/clang/test/CodeGenOpenCL/convergent.cl
@@ -127,7 +127,7 @@ void test_not_unroll() {
// CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
// CHECK-LABEL: @assume_convergent_asm
-// CHECK: tail call void asm sideeffect "s_barrier", ""() #5
+// CHECK: tail call void asm sideeffect "s_barrier", ""() #6
kernel void assume_convergent_asm()
{
__asm__ volatile("s_barrier");
@@ -138,6 +138,7 @@ kernel void assume_convergent_asm()
// CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} }
// CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} }
// CHECK: attributes #4 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #5 = { {{[^}]*}}convergent{{[^}]*}} }
-// CHECK: attributes #6 = { {{[^}]*}}nounwind{{[^}]*}} }
-// CHECK: attributes #7 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
+// CHECK: attributes #5 = { {{[^}]*}}alwaysinline convergent{{[^}]*}} }
+// CHECK: attributes #6 = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK: attributes #7 = { {{[^}]*}}nounwind{{[^}]*}} }
+// CHECK: attributes #8 = { {{[^}]*}}convergent noduplicate nounwind{{[^}]*}} }
diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
index e741cf63f30b5..8e970f121bca8 100644
--- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
+++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -9,15 +9,8 @@
typedef struct {int a;} ndrange_t;
kernel void test(int i) {
-
// AMDGPU-LABEL: define {{.*}} amdgpu_kernel void @test
-// AMDGPU-LABEL: call void @__clang_ocl_kern_imp_test(i32 noundef %0)
-
// SPIR-LABEL: define {{.*}} spir_kernel void @test
-// SPIR-LABEL: call spir_func void @__clang_ocl_kern_imp_test(i32 noundef %0)
-
-// AMDGPU-LABEL: define {{.*}} void @__clang_ocl_kern_imp_test
-// SPIR-LABEL: define {{.*}} spir_func void @__clang_ocl_kern_imp_test
// COMMON-LABEL: entry:
// AMDGPU: %block_sizes = alloca [1 x i64]
@@ -44,5 +37,5 @@ kernel void test(int i) {
// CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "<stdin>"
// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]]
-// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 33)
-// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 34, scope: ![[IFSCOPE]])
+// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26)
+// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: ![[IFSCOPE]])
More information about the cfe-commits mailing list