[clang] [llvm] [OpenMP] Replace nvvm.annotation usage with kernel calling conventions (PR #122320)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 09:32:22 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: Alex MacLean (AlexMaclean)
<details>
<summary>Changes</summary>
Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling convention is more idiomatic and compile-time performant than using the `nvvm.annotations !"kernel"` metadata.
Transition OMPIRBuilder to use calling conventions for PTX kernels and no longer emit `nvvm.annotations`. Update OpenMPOpt to work with kernels specified via calling convention as well as metadata. Update OpenMP tests to use the calling conventions.
---
Patch is 345.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122320.diff
33 Files Affected:
- (modified) clang/test/OpenMP/assumes_include_nvptx.cpp (+2-2)
- (modified) clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp (+1-1)
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+3-13)
- (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+38-18)
- (modified) llvm/test/Transforms/OpenMP/always_inline_device.ll (+7-10)
- (modified) llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll (+1-5)
- (modified) llvm/test/Transforms/OpenMP/barrier_removal.ll (+29-117)
- (modified) llvm/test/Transforms/OpenMP/bug66687.ll (+4-7)
- (modified) llvm/test/Transforms/OpenMP/custom_state_machines.ll (+20-61)
- (modified) llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll (+26-83)
- (modified) llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll (+2-5)
- (modified) llvm/test/Transforms/OpenMP/deduplication_target.ll (+1-3)
- (modified) llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll (+3-10)
- (modified) llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll (+3-10)
- (modified) llvm/test/Transforms/OpenMP/global_constructor.ll (+5-8)
- (modified) llvm/test/Transforms/OpenMP/globalization_remarks.ll (+1-3)
- (modified) llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll (+2-7)
- (modified) llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll (+1-3)
- (modified) llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll (+4-13)
- (modified) llvm/test/Transforms/OpenMP/nested_parallelism.ll (+4-9)
- (modified) llvm/test/Transforms/OpenMP/parallel_level_fold.ll (+3-10)
- (modified) llvm/test/Transforms/OpenMP/remove_globalization.ll (+11-15)
- (modified) llvm/test/Transforms/OpenMP/replace_globalization.ll (+7-14)
- (modified) llvm/test/Transforms/OpenMP/single_threaded_execution.ll (+1-3)
- (modified) llvm/test/Transforms/OpenMP/spmdization.ll (+240-1367)
- (modified) llvm/test/Transforms/OpenMP/spmdization_assumes.ll (+12-15)
- (modified) llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll (-2)
- (modified) llvm/test/Transforms/OpenMP/spmdization_guarding.ll (+51-55)
- (modified) llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll (+15-21)
- (modified) llvm/test/Transforms/OpenMP/spmdization_indirect.ll (+72-89)
- (modified) llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll (+14-21)
- (modified) llvm/test/Transforms/OpenMP/spmdization_remarks.ll (+2-5)
- (modified) llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll (+19-77)
``````````diff
diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp
index 4577ea4c9c2b5e..c5040989a0e407 100644
--- a/clang/test/OpenMP/assumes_include_nvptx.cpp
+++ b/clang/test/OpenMP/assumes_include_nvptx.cpp
@@ -11,11 +11,11 @@
// TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated.
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
+// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
// CHECK: call i32 @__kmpc_target_init(
// CHECK: declare noundef float @_Z3sinf(float noundef) [[attr1:#[0-9]*]]
// CHECK: declare void @__kmpc_target_deinit(
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
+// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
// CHECK: %call = call noundef double @_Z3sind(double noundef 0.000000e+00) [[attr2:#[0-9]]]
// CHECK: declare noundef double @_Z3sind(double noundef) [[attr1]]
diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
index d573f1cd193d64..94ace20826db4d 100644
--- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -90,7 +90,7 @@ int foo(int n, double *ptr) {
ptr[0]++;
}
- // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
+ // TCHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
// TCHECK: [[DYN_PTR_ADDR:%.+]] = alloca ptr,
// TCHECK: [[PTR_ADDR:%.+]] = alloca ptr,
// TCHECK-NOT: alloca ptr,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8dbf2aa7e0a243..487f886f9bdbfd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6404,6 +6404,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
if (T.isAMDGCN())
OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
+ else if (T.isNVPTX())
+ OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
}
}
@@ -9077,20 +9079,8 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
if (!Fn)
return;
- Module &M = *(Fn->getParent());
- LLVMContext &Ctx = M.getContext();
-
- // Get "nvvm.annotations" metadata node.
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
- Metadata *MDVals[] = {
- ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
- // Append metadata to nvvm.annotations.
- MD->addOperand(MDNode::get(Ctx, MDVals));
-
// Add a function attribute for the kernel.
- Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
+ Fn->addFnAttr("kernel");
if (T.isAMDGCN())
Fn->addFnAttr("uniform-work-group-size", "true");
Fn->addFnAttr(Attribute::MustProgress);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 67585e9c80ef4e..f495840c254d59 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@
#include "llvm/Transforms/IPO/OpenMPOpt.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -36,6 +37,7 @@
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
@@ -5909,34 +5911,52 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
return Fn.hasFnAttribute("kernel");
}
+static bool isKernelCC(Function &F) {
+ switch (F.getCallingConv()) {
+ default:
+ return false;
+ case CallingConv::PTX_Kernel:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ }
+}
+
KernelSet llvm::omp::getDeviceKernels(Module &M) {
// TODO: Create a more cross-platform way of determining device kernels.
- NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
KernelSet Kernels;
- if (!MD)
- return Kernels;
-
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() < 2)
- continue;
- MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
- if (!KindID || KindID->getString() != "kernel")
- continue;
-
- Function *KernelFn =
- mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
- if (!KernelFn)
- continue;
+ DenseSet<const Function *> SeenKernels;
+ auto ProcessKernel = [&](Function &KF) {
+ if (SeenKernels.contains(&KF))
+ return;
+ SeenKernels.insert(&KF);
// We are only interested in OpenMP target regions. Others, such as kernels
// generated by CUDA but linked together, are not interesting to this pass.
- if (isOpenMPKernel(*KernelFn)) {
+ if (isOpenMPKernel(KF)) {
++NumOpenMPTargetRegionKernels;
- Kernels.insert(KernelFn);
+ Kernels.insert(&KF);
} else
++NumNonOpenMPTargetRegionKernels;
- }
+ };
+
+ if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+ if (!KindID || KindID->getString() != "kernel")
+ continue;
+
+ if (auto *KernelFn =
+ mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
+ ProcessKernel(*KernelFn);
+ }
+
+ for (Function &F : M)
+ if (isKernelCC(F))
+ ProcessKernel(F);
return Kernels;
}
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 6028ff5278037b..9c5b19f7a6c88c 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -17,7 +17,7 @@
; CHECK: @G = external global i8
; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
-define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
+define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
; CHECK: Function Attrs: norecurse nounwind
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
; CHECK-NEXT: entry:
@@ -79,12 +79,10 @@ attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-wi
attributes #2 = { convergent }
!omp_offload.info = !{!0}
-!nvvm.annotations = !{!1}
!llvm.module.flags = !{!2, !3, !4, !5, !6}
!llvm.ident = !{!7}
!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
-!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"openmp", i32 50}
!4 = !{i32 7, !"openmp-device", i32 50}
@@ -97,11 +95,10 @@ attributes #2 = { convergent }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
-; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
-; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
-; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
;.
diff --git a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
index 9c0416af359d4d..3f4790ee15ac8d 100644
--- a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
+++ b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
@@ -13,10 +13,6 @@ define linkonce_odr hidden i8 @_ZStplIdESt7complexIT_ERKS2_S4_() local_unnamed_a
ret i8 undef
}
-declare void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
+declare ptx_kernel void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
declare dso_local fastcc void @__kmpc_for_static_init_8u() unnamed_addr
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index 47a5d5104aa8bd..5b7544b1a79616 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -28,7 +28,7 @@ declare void @llvm.assume(i1)
; CHECK: @G1 = global i32 42
; CHECK: @G2 = addrspace(1) global i32 0
;.
-define void @pos_empty_1(i1 %c) "kernel" {
+define amdgpu_kernel void @pos_empty_1(i1 %c) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@pos_empty_1
; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; MODULE-NEXT: ret void
@@ -45,7 +45,7 @@ define void @pos_empty_1(i1 %c) "kernel" {
call void @llvm.assume(i1 %c)
ret void
}
-define void @pos_empty_2() "kernel" {
+define amdgpu_kernel void @pos_empty_2() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_2
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: ret void
@@ -53,7 +53,7 @@ define void @pos_empty_2() "kernel" {
call void @aligned_barrier()
ret void
}
-define void @pos_empty_3() "kernel" {
+define amdgpu_kernel void @pos_empty_3() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_3
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -61,7 +61,7 @@ define void @pos_empty_3() "kernel" {
call void @llvm.nvvm.barrier0()
ret void
}
-define void @pos_empty_4() "kernel" {
+define amdgpu_kernel void @pos_empty_4() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_4
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -69,7 +69,7 @@ define void @pos_empty_4() "kernel" {
call i32 @llvm.nvvm.barrier0.and(i32 0)
ret void
}
-define void @pos_empty_5() "kernel" {
+define amdgpu_kernel void @pos_empty_5() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_5
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -77,7 +77,7 @@ define void @pos_empty_5() "kernel" {
call i32 @llvm.nvvm.barrier0.or(i32 0)
ret void
}
-define void @pos_empty_6() "kernel" {
+define amdgpu_kernel void @pos_empty_6() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_6
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -85,7 +85,7 @@ define void @pos_empty_6() "kernel" {
call i32 @llvm.nvvm.barrier0.popc(i32 0)
ret void
}
-define void @pos_empty_7a() "kernel" {
+define amdgpu_kernel void @pos_empty_7a() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -96,7 +96,7 @@ define void @pos_empty_7a() "kernel" {
ret void
}
; FIXME: We should remove the barrier.
-define void @pos_empty_7b() "kernel" {
+define amdgpu_kernel void @pos_empty_7b() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]]
@@ -109,7 +109,7 @@ define void @pos_empty_7b() "kernel" {
call void @unknown()
ret void
}
-define void @pos_empty_8(i1 %c) "kernel" {
+define amdgpu_kernel void @pos_empty_8(i1 %c) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_8
; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -126,7 +126,7 @@ t:
f:
ret void
}
-define void @neg_empty_8() "kernel" {
+define amdgpu_kernel void @neg_empty_8() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_empty_8
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -137,7 +137,7 @@ define void @neg_empty_8() "kernel" {
call void @llvm.amdgcn.s.barrier()
ret void
}
-define void @neg_empty_9(i1 %c) "kernel" {
+define amdgpu_kernel void @neg_empty_9(i1 %c) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_empty_9
; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -173,7 +173,7 @@ m:
ret void
}
; FIXME: We should remove the barrier
-define void @pos_empty_10() "kernel" {
+define amdgpu_kernel void @pos_empty_10() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_10
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: br label [[M:%.*]]
@@ -186,7 +186,7 @@ m:
call void @llvm.amdgcn.s.barrier()
ret void
}
-define void @pos_empty_11() "kernel" {
+define amdgpu_kernel void @pos_empty_11() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_11
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: br label [[M:%.*]]
@@ -206,7 +206,7 @@ define void @empty() {
ret void
}
; FIXME: We should remove the barrier in the end but not the first one.
-define void @neg_empty_12(i1 %c) "kernel" {
+define amdgpu_kernel void @neg_empty_12(i1 %c) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@neg_empty_12
; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; MODULE-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -266,7 +266,7 @@ define void @neg_empty_2() "kernel" {
@GC1 = constant i32 42
@GC2 = addrspace(4) global i32 0
@GPtr4 = addrspace(4) global ptr addrspace(4) null
-define void @pos_constant_loads() "kernel" {
+define amdgpu_kernel void @pos_constant_loads() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_constant_loads
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(4), ptr addrspace(4) @GPtr4, align 8
@@ -296,7 +296,7 @@ define void @pos_constant_loads() "kernel" {
@GS = addrspace(3) global i32 0
@GPtr = global ptr null
; TODO: We could remove some of the barriers due to the lack of write effects.
-define void @neg_loads() "kernel" {
+define amdgpu_kernel void @neg_loads() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_loads
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8
@@ -327,7 +327,7 @@ define void @neg_loads() "kernel" {
@PG1 = thread_local global i32 42
@PG2 = addrspace(5) global i32 0
@GPtr5 = global ptr addrspace(5) null
-define void @pos_priv_mem() "kernel" {
+define amdgpu_kernel void @pos_priv_mem() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4
@@ -358,7 +358,7 @@ define void @pos_priv_mem() "kernel" {
}
@G1 = global i32 42
@G2 = addrspace(1) global i32 0
-define void @neg_mem() "kernel" {
+define amdgpu_kernel void @neg_mem() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_mem
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8
@@ -388,7 +388,7 @@ define void @neg_mem() "kernel" {
ret void
}
-define void @pos_multiple() "kernel" {
+define amdgpu_kernel void @pos_multiple() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_multiple
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -404,7 +404,7 @@ define void @pos_multiple() "kernel" {
ret void
}
-define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
+define amdgpu_kernel void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1
; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
@@ -461,7 +461,7 @@ m:
ret void
}
-define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" {
+define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2
; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: store i32 4, ptr [[P]], align 4
@@ -727,7 +727,7 @@ define internal void @barrier_then_write_then_barrier0(ptr %p) {
call void @aligned_barrier()
ret void
}
-define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" {
+define amdgpu_kernel void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0
; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] {
; MODULE-NEXT: call void @barrier_then_write_then_barrier0(ptr [[P]])
@@ -1040,7 +1040,7 @@ define internal void @callee_barrier() {
call void @aligned_barrier()
ret void
}
-define void @caller_barrier1() "kernel" {
+define amdgpu_kernel void @caller_barrier1() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@caller_barrier1
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @callee_barrier()
@@ -1051,7 +1051,7 @@ define void @caller_barrier1() "kernel" {
call void @aligned_barrier()
ret void
}
-define void @caller_barrier2() "kernel" {
+define amdgpu_kernel void @caller_barrier2() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@caller_barrier2
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -1065,7 +1065,7 @@ define void @caller_barrier2() "kernel" {
ret void
}
-define void @loop_barrier() "kernel" {
+define amdgpu_kernel void @loop_barrier() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1095,7 +1095,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1129,7 +1129,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers_unknown() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers_unknown() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_unknown
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1165,7 +1165,7 @@ exit:
ret void
}
-define void @loop_barrier_store() "kernel" {
+define amdgpu_kernel void @loop_barrier_store() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_store
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1195,7 +1195,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers_store() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers_store() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_store
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1232,37 +1232,7 @@ exit:
}
!llvm.module.flags = !{!16,!15}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26,!27,!28,!29,!30}
-!0 = !{ptr @pos_empty_1, !"kernel", i32 1}
-!1 = !{ptr @pos_empty_2, !"kernel", i32 1}
-!2 = !{ptr @pos_empty_3, !"kernel", i32 1}
-!3 = !{ptr @pos_empty_4, !"kernel", i32 1}
-!4 = !{ptr @pos_empty_5, !"kernel", i32 1}
-!5 = !{ptr @pos_empty_6, !"kernel", i32 1}
-!17 = !{ptr @pos_empty_7a, !"kernel", i32 1}
-!18 = !{ptr @pos_empty_7b, !"kernel", i32 1}
-!23 = !{ptr @pos_empty_8, !"kernel", i32 1}
-!24 = !{ptr @caller_barrier1, !"kernel", i32 1}
-!25 = !{ptr @caller_barrier2, !"kernel", i32 1}
-!26 = !{ptr @loop_barrier, !"kernel", i32 1}
-!27 = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
-!28 = !{ptr @loop_barrier_end_barriers_unkno...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/122320
More information about the llvm-commits
mailing list