[llvm] pr/amdgpu closed world (PR #66488)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 15 03:43:35 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu
            
<details>
<summary>Changes</summary>
None
--

Patch is 241.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66488.diff

14 Files Affected:

- (modified) llvm/include/llvm/Transforms/IPO/Attributor.h (+5-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+32-1) 
- (modified) llvm/lib/Transforms/IPO/Attributor.cpp (+1-1) 
- (modified) llvm/lib/Transforms/IPO/AttributorAttributes.cpp (+2-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll (+22-14) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll (+74-25) 
- (modified) llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll (+43-22) 
- (modified) llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll (+30-14) 
- (modified) llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll (+22-10) 
- (modified) llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll (+17-9) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+2189-1820) 
- (modified) llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll (+44-21) 
- (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+36-12) 


<pre>
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index bd1bd8261123e51..f266620b65ca1fe 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1435,7 +1435,7 @@ struct AttributorConfig {
   /// Callback function to determine if an indirect call targets should be made
   /// direct call targets (with an if-cascade).
   std::function&lt;bool(Attributor &amp;A, const AbstractAttribute &amp;AA, CallBase &amp;CB,
-                     Function &amp;AssummedCallee)&gt;
+                     Function &amp;AssummedCallee, unsigned NumCallees)&gt;
       IndirectCalleeSpecializationCallback = nullptr;
 
   /// Helper to update an underlying call graph and to delete functions.
@@ -1717,10 +1717,11 @@ struct Attributor {
   /// Return true if we should specialize the call site \b CB for the potential
   /// callee \p Fn.
   bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &amp;AA,
-                                         CallBase &amp;CB, Function &amp;Callee) {
+                                         CallBase &amp;CB, Function &amp;Callee,
+                                         unsigned NumCallees) {
     return Configuration.IndirectCalleeSpecializationCallback
-               ? Configuration.IndirectCalleeSpecializationCallback(*this, AA,
-                                                                    CB, Callee)
+               ? Configuration.IndirectCalleeSpecializationCallback(
+                     *this, AA, CB, Callee, NumCallees)
                : true;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 57c873f00a4a195..fb203c9e4006426 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,11 +14,15 @@
 #include &quot;GCNSubtarget.h&quot;
 #include &quot;Utils/AMDGPUBaseInfo.h&quot;
 #include &quot;llvm/Analysis/CycleAnalysis.h&quot;
+#include &quot;llvm/Analysis/TargetTransformInfo.h&quot;
 #include &quot;llvm/CodeGen/TargetPassConfig.h&quot;
+#include &quot;llvm/IR/CallingConv.h&quot;
 #include &quot;llvm/IR/IntrinsicsAMDGPU.h&quot;
 #include &quot;llvm/IR/IntrinsicsR600.h&quot;
+#include &quot;llvm/Support/Casting.h&quot;
 #include &quot;llvm/Target/TargetMachine.h&quot;
 #include &quot;llvm/Transforms/IPO/Attributor.h&quot;
+#include &lt;optional&gt;
 
 #define DEBUG_TYPE &quot;amdgpu-attributor&quot;
 
@@ -944,16 +948,29 @@ class AMDGPUAttributor : public ModulePass {
         {&amp;AAAMDAttributes::ID, &amp;AAUniformWorkGroupSize::ID,
          &amp;AAPotentialValues::ID, &amp;AAAMDFlatWorkGroupSize::ID,
          &amp;AAAMDWavesPerEU::ID, &amp;AACallEdges::ID, &amp;AAPointerInfo::ID,
-         &amp;AAPotentialConstantValues::ID, &amp;AAUnderlyingObjects::ID});
+         &amp;AAIndirectCallInfo::ID, &amp;AAPotentialConstantValues::ID,
+         &amp;AAUnderlyingObjects::ID});
 
     AttributorConfig AC(CGUpdater);
     AC.Allowed = &amp;Allowed;
     AC.IsModulePass = true;
     AC.DefaultInitializeLiveInternals = false;
+    AC.IsClosedWorldModule = true;
     AC.IPOAmendableCB = [](const Function &amp;F) {
       return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
     };
 
+    // Callback to determine if we should specialize an indirect call site with a
+    // specific callee. It&#x27;s effectively a heuristic and we can add checks for
+    // the callee size, PGO, etc. For now, we check for single potential callees
+    // and kernel arguments as they are known uniform values.
+    AC.IndirectCalleeSpecializationCallback =
+        [&amp;](Attributor &amp;A, const AbstractAttribute &amp;AA, CallBase &amp;CB,
+            Function &amp;Callee, unsigned NumCallees) {
+          return indirectCalleeSpecializationCallback(A, AA, CB, Callee,
+                                                      NumCallees);
+        };
+
     Attributor A(Functions, InfoCache, AC);
 
     for (Function &amp;F : M) {
@@ -975,6 +992,20 @@ class AMDGPUAttributor : public ModulePass {
     AU.addRequired&lt;CycleInfoWrapperPass&gt;();
   }
 
+  /// Helper to decide if we should specialize the indirect \p CB for \p Callee,
+  /// which is one of the \p NumCallees potential callees.
+  bool indirectCalleeSpecializationCallback(Attributor &amp;A,
+                                            const AbstractAttribute &amp;AA,
+                                            CallBase &amp;CB, Function &amp;Callee,
+                                            unsigned NumCallees) {
+    // Singleton functions should be specialized.
+    if (NumCallees == 1)
+      return true;
+    // Otherwise specialize uniform values.
+    const auto &amp;TTI = TM-&gt;getTargetTransformInfo(*CB.getCaller());
+    return TTI.isAlwaysUniform(CB.getCalledOperand());
+  }
+
   StringRef getPassName() const override { return &quot;AMDGPU Attributor&quot;; }
   TargetMachine *TM;
   static char ID;
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 1ffafc65ba63a4f..5b5a9a28f6d3838 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3821,7 +3821,7 @@ static bool runAttributorOnFunctions(InformationCache &amp;InfoCache,
   if (MaxSpecializationPerCB.getNumOccurrences()) {
     AC.IndirectCalleeSpecializationCallback =
         [&amp;](Attributor &amp;, const AbstractAttribute &amp;AA, CallBase &amp;CB,
-            Function &amp;Callee) {
+            Function &amp;Callee, unsigned NumCallees) {
           if (MaxSpecializationPerCB == 0)
             return false;
           auto &amp;Set = IndirectCalleeTrackingMap[&amp;CB];
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 03b5dc3899ac8f8..86c6bb04368e241 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12352,7 +12352,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
     SmallVector&lt;Function *, 8&gt; SkippedAssumedCallees;
     SmallVector&lt;std::pair&lt;CallInst *, Instruction *&gt;&gt; NewCalls;
     for (Function *NewCallee : AssumedCallees) {
-      if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) {
+      if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee,
+                                               AssumedCallees.size())) {
         SkippedAssumedCallees.push_back(NewCallee);
         SpecializedForAllCallees = false;
         continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index 4eba84f61c2d8a3..0bb9a58c29ae718 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -stop-after=irtranslator -attributor-assume-closed-world=false -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CHECK %s
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CWRLD %s
 
 define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
   ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
@@ -52,24 +53,31 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
   ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; CWRLD-LABEL: name: test_indirect_call_sgpr_ptr
+  ; CWRLD: bb.1 (%ir-block.0):
+  ; CWRLD-NEXT:   liveins: $sgpr4_sgpr5
+  ; CWRLD-NEXT: {{  $}}
+  ; CWRLD-NEXT:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; CWRLD-NEXT:   [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
   call void %fptr()
   ret void
 }
 
 define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(ptr %fptr) {
-  ; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(&lt;4 x s32&gt;) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](&lt;4 x s32&gt;)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; SAMEC-LABEL: name: test_gfx_indirect_call_sgpr_ptr
+  ; SAMEC: bb.1 (%ir-block.0):
+  ; SAMEC-NEXT:   liveins: $vgpr0, $vgpr1
+  ; SAMEC-NEXT: {{  $}}
+  ; SAMEC-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; SAMEC-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; SAMEC-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; SAMEC-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; SAMEC-NEXT:   [[COPY2:%[0-9]+]]:_(&lt;4 x s32&gt;) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; SAMEC-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](&lt;4 x s32&gt;)
+  ; SAMEC-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; SAMEC-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; SAMEC-NEXT:   SI_RETURN
   call amdgpu_gfx void %fptr()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index c8ac5bcf8d22242..35affc7a8b140be 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features &lt; %s | FileCheck -check-prefixes=AKF_HSA %s
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor &lt; %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor -attributor-assume-closed-world=false &lt; %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA,OWRLD_ATTR_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor &lt; %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA,CWRLD_ATTR_HSA %s
 
 ; TODO: The test contains UB which is refined by the Attributor and should be removed.
 
@@ -18,6 +19,16 @@ declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0
 declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0
 
+ at G1 = global ptr poison
+ at G2 = global ptr poison
+
+;.
+; AKF_HSA: @[[G1:[a-zA-Z0-9_$&quot;\\.-]+]] = global ptr poison
+; AKF_HSA: @[[G2:[a-zA-Z0-9_$&quot;\\.-]+]] = global ptr poison
+;.
+; ATTRIBUTOR_HSA: @[[G1:[a-zA-Z0-9_$&quot;\\.-]+]] = global ptr poison
+; ATTRIBUTOR_HSA: @[[G2:[a-zA-Z0-9_$&quot;\\.-]+]] = global ptr poison
+;.
 define void @use_workitem_id_x() #1 {
 ; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x
 ; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] {
@@ -766,19 +777,55 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] {
 ; AKF_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; AKF_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
+; AKF_HSA-NEXT:    store ptr @indirect_callee1, ptr @G1, align 8
+; AKF_HSA-NEXT:    store ptr @indirect_callee2, ptr @G2, align 8
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
-; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
-; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
-; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
+; OWRLD_ATTR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
+; OWRLD_ATTR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; OWRLD_ATTR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
+; OWRLD_ATTR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
+; OWRLD_ATTR_HSA-NEXT:    store ptr @indirect_callee1, ptr @G1, align 8
+; OWRLD_ATTR_HSA-NEXT:    store ptr @indirect_callee2, ptr @G2, align 8
+; OWRLD_ATTR_HSA-NEXT:    ret float [[FADD]]
+;
+; CWRLD_ATTR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
+; CWRLD_ATTR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] {
+; CWRLD_ATTR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]](), !callees !0
+; CWRLD_ATTR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
+; CWRLD_ATTR_HSA-NEXT:    store ptr @indirect_callee1, ptr @G1, align 8
+; CWRLD_ATTR_HSA-NEXT:    store ptr @indirect_callee2, ptr @G2, align 8
+; CWRLD_ATTR_HSA-NEXT:    ret float [[FADD]]
 ;
   %f = call float %fptr()
   %fadd = fadd float %f, 1.0
+  store ptr @indirect_callee1, ptr @G1
+  store ptr @indirect_callee2, ptr @G2
   ret float %fadd
 }
 
+define float @indirect_callee1() {
+; AKF_HSA-LABEL: define {{[^@]+}}@indirect_callee1() {
+; AKF_HSA-NEXT:    ret float 0x40091EB860000000
+;
+; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_callee1
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-NEXT:    ret float 0x40091EB860000000
+;
+  ret float 0x40091EB860000000
+}
+define float @indirect_callee2(float noundef %arg) {
+; AKF_HSA-LABEL: define {{[^@]+}}@indirect_callee2
+; AKF_HSA-SAME: (float noundef [[ARG:%.*]]) {
+; AKF_HSA-NEXT:    ret float [[ARG]]
+;
+; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_callee2
+; ATTRIBUTOR_HSA-SAME: (float noundef [[ARG:%.*]]) #[[ATTR19]] {
+; ATTRIBUTOR_HSA-NEXT:    ret float [[ARG]]
+;
+  ret float %arg
+}
+
 declare float @extern() #3
 define float @func_extern_call() #3 {
 ; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call
@@ -845,7 +892,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +908,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +924,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +940,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +975,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +988,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -956,7 +1003,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_def()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -969,7 +1016,7 @@ define void @unused_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -980,7 +1027,7 @@ define internal void @known_func() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -994,7 +1041,7 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -1040,15 +1087,17 @@ attributes #6 = { &quot;enqueued-block&quot; }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind &quot;amdgpu-waves-per-eu&quot;=&quot;4,10&quot; &quot;uniform-work-group-size&quot;=&quot;false&quot; }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind &quot;amdgpu-no-completion-action&quot; &quot;amdgpu-no-default-queue&quot; &quot;amdgpu-no-dispatch-id&quot; &quot;amdgpu-no-dispatch-ptr&quot; &quot;amdgpu-no-heap-ptr&quot; &quot;amdgpu-no-hostcall-ptr&quot; &quot;amdgpu-no-implicitarg-ptr&quot; &quot;amdgpu-no-lds-kernel-id&quot; &quot;amdgpu-no-multigrid-sync-arg&quot; &quot;amdgpu-no-queue-ptr&quot; &quot;amdgpu-no-workgroup-id-x&quot; &a...
<truncated>
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66488


More information about the llvm-commits mailing list