[llvm] [AMDGPU] Remove the AnnotateKernelFeatures pass (PR #130198)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 6 14:48:58 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

<details>
<summary>Changes</summary>

Previously the AnnotateKernelFeatures pass infers two attributes: amdgpu-calls and amdgpu-stack-objects, which are used to help determine if flat scratch init is allowed. PR #<!-- -->118907 created the amdgpu-no-flat-scratch-init attribute. Continuing with that work, this patch makes use of this attribute to determine flat scratch init, replacing amdgpu-calls and amdgpu-stack-objects. This also leads to the removal of the AnnotateKernelFeatures pass.

---

Patch is 2.25 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130198.diff


116 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (-3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp (-9) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-7) 
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.cpp (+6-10) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+340-28) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+360-30) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+40-31) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll (+32-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll (+4-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll (+24) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+33) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+33) 
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+15-3) 
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll (-62) 
- (modified) llvm/test/CodeGen/AMDGPU/always-uniform.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll (-331) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (-165) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll (-103) 
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll (+14-7) 
- (modified) llvm/test/CodeGen/AMDGPU/attributor-noopt.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll (+20-20) 
- (modified) llvm/test/CodeGen/AMDGPU/code-object-v3.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll (-13) 
- (modified) llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+98-2) 
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+66) 
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+19-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+217-19) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll (+5-2) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll (+60-20) 
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+48) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+60) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+61-1) 
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+231) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+32-4) 
- (modified) llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll (+53-5) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+206-8) 
- (modified) llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/kernarg-size.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+5-25) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll (+12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll (+7-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+56-14) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+96-18) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+120-6) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+121-4) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+79-4) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+18) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+160-4) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+123-6) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+90-15) 
- (modified) llvm/test/CodeGen/AMDGPU/load-select-ptr.ll (+2-1) 
- (modified) llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (+156-144) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+1380) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+75) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+1380) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+1380) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+66) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+1365) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+1320) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+273) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+15) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+276) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+261) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+18) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+276) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+276) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+9) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+34-25) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+18-12) 
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+210) 
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2f16.ll (+21) 
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2i16.ll (+18) 
- (modified) llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll (-8) 
- (modified) llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll (+24-24) 
- (modified) llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll (+83-95) 
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+180-199) 
- (modified) llvm/test/CodeGen/AMDGPU/sad.ll (+84-30) 
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll (+16) 
- (modified) llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll (+236-228) 
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+21-3) 
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (-15) 
- (modified) llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll (+62-8) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+14-2) 
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+45) 
- (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+83-8) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+1-1) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (+4-3) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (+4-3) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..c30c1cd3c8fb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
 
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
-Pass *createAMDGPUAnnotateKernelFeaturesPass();
 Pass *createAMDGPUAttributorLegacyPass();
 void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
-void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
-extern char &AMDGPUAnnotateKernelFeaturesID;
 
 // DPP/Iterative option enables the atomic optimizer with given strategy
 // whereas None disables the atomic optimizer.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a9bd41382c255..9c9fa5c6e2f0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 
 char AMDGPUAnnotateKernelFeatures::ID = 0;
 
-char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
-
-INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
-                "Add AMDGPU function attributes", false, false)
-
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
   bool HaveStackObjects = false;
   bool Changed = false;
@@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
   TM = &TPC->getTM<TargetMachine>();
   return false;
 }
-
-Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
-  return new AMDGPUAnnotateKernelFeatures();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..bb139e2c185c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -510,7 +510,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
   initializeAMDGPUAttributorLegacyPass(*PR);
-  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
@@ -1294,12 +1293,6 @@ void AMDGPUPassConfig::addIRPasses() {
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
-    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
-    // analysis, and should be removed.
-    addPass(createAMDGPUAnnotateKernelFeaturesPass());
-  }
-
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 55af5826e90d0..c812837c29a46 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
   const CallingConv::ID CC = F.getCallingConv();
   const bool IsKernel =
       CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
-  // FIXME: Should have analysis or something rather than attribute to detect
-  // calls.
-  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
-  // FIXME: This attribute is a hack, we just need an analysis on the function
-  // to look for allocas.
-  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
 
   if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
     KernargSegmentPtr = true;
@@ -629,12 +623,14 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
       DispatchID = true;
   }
 
-  // TODO: This could be refined a lot. The attribute is a poor way of
-  // detecting calls or stack objects that may require it before argument
-  // lowering.
+  const bool IsNoFlatScratchInitSet = F.hasFnAttribute("amdgpu-no-flat-scratch-init");
+
   if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
       (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
-      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+      // The line below: If enableFlatScratch() is true, whether
+      // no-flat-scratch-init is set is not important. If enableFlatScratch()
+      // is false, FlatScratchInit cannot be true for graphics CC.
+      (ST.enableFlatScratch() || (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
       !ST.flatScratchIsArchitected()) {
     FlatScratchInit = true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index b96fc71be057e..6c2272c389a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT:    s_add_i32 s12, s12, s17
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_add_i32 s12, s12, s17
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/130198


More information about the llvm-commits mailing list