[llvm] [AMDGPU] Remove the AnnotateKernelFeatures pass (PR #130198)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 14:48:58 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jun Wang (jwanggit86)
<details>
<summary>Changes</summary>
Previously, the AnnotateKernelFeatures pass inferred two attributes, amdgpu-calls and amdgpu-stack-objects, which were used to help determine whether flat scratch init is allowed. PR #118907 introduced the amdgpu-no-flat-scratch-init attribute. Continuing that work, this patch uses the new attribute to make that determination, replacing amdgpu-calls and amdgpu-stack-objects. This in turn allows the AnnotateKernelFeatures pass to be removed.
---
Patch is 2.25 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130198.diff
116 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp (-9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-7)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.cpp (+6-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+340-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+360-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+40-31)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll (+32-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll (+4-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll (+24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+33)
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+15-3)
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll (-62)
- (modified) llvm/test/CodeGen/AMDGPU/always-uniform.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll (-331)
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (-165)
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll (-103)
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll (+14-7)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-noopt.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/code-object-v3.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll (-13)
- (modified) llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+98-2)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+66)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+19-1)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+217-19)
- (modified) llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll (+5-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll (+60-20)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+48)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+60)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+61-1)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+231)
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/hsa.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+32-4)
- (modified) llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll (+53-5)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+206-8)
- (modified) llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/kernarg-size.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+5-25)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll (+7-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+56-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+96-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+120-6)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+121-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+79-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+160-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+123-6)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+90-15)
- (modified) llvm/test/CodeGen/AMDGPU/load-select-ptr.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (+156-144)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+1380)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+75)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+1380)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+1380)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+66)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+1365)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+1320)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll (+273)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+15)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll (+276)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll (+261)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll (+276)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll (+276)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+34-25)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+18-12)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+210)
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2f16.ll (+21)
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2i16.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll (+83-95)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+180-199)
- (modified) llvm/test/CodeGen/AMDGPU/sad.ll (+84-30)
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll (+236-228)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+21-3)
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (-15)
- (modified) llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll (+62-8)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+14-2)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+45)
- (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+83-8)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+1-1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (+4-3)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (+4-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..c30c1cd3c8fb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
-Pass *createAMDGPUAnnotateKernelFeaturesPass();
Pass *createAMDGPUAttributorLegacyPass();
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
-void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
-extern char &AMDGPUAnnotateKernelFeaturesID;
// DPP/Iterative option enables the atomic optimizer with given strategy
// whereas None disables the atomic optimizer.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a9bd41382c255..9c9fa5c6e2f0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
char AMDGPUAnnotateKernelFeatures::ID = 0;
-char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
-
-INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool HaveStackObjects = false;
bool Changed = false;
@@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
TM = &TPC->getTM<TargetMachine>();
return false;
}
-
-Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
- return new AMDGPUAnnotateKernelFeatures();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..bb139e2c185c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -510,7 +510,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
- initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
@@ -1294,12 +1293,6 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
- // FIXME: This pass adds 2 hacky attributes that can be replaced with an
- // analysis, and should be removed.
- addPass(createAMDGPUAnnotateKernelFeaturesPass());
- }
-
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 55af5826e90d0..c812837c29a46 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
const CallingConv::ID CC = F.getCallingConv();
const bool IsKernel =
CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
- // FIXME: Should have analysis or something rather than attribute to detect
- // calls.
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
- // FIXME: This attribute is a hack, we just need an analysis on the function
- // to look for allocas.
- const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
KernargSegmentPtr = true;
@@ -629,12 +623,14 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
DispatchID = true;
}
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
+ const bool IsNoFlatScratchInitSet = F.hasFnAttribute("amdgpu-no-flat-scratch-init");
+
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
(IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ // The line below: If enableFlatScratch() is true, whether
+ // no-flat-scratch-init is set is not important. If enableFlatScratch()
+ // is false, FlatScratchInit cannot be true for graphics CC.
+ (ST.enableFlatScratch() || (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
!ST.flatScratchIsArchitected()) {
FlatScratchInit = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index b96fc71be057e..6c2272c389a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/130198
More information about the llvm-commits mailing list