[llvm] AMDGPU: Move attributor into optimization pipeline (PR #83131)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 27 05:16:51 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Removing the AMDGPU attributor from the codegen pipeline induces a lot of test churn because llc no longer optimizes out implicit arguments to kernels.

Mostly mechanical, but there are some creative test updates. I preferred to take the changes as-is in tests where the ABI isn't relevant. In cases where the ABI is more relevant, or the optimize-out logic was too ingrained in the test, I pre-run the optimization. Some cases manually add attributes to disable inputs.

---

Patch is 17.97 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83131.diff


534 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+8-5) 
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+278-259) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+297-278) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+38-38) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll (+15-15) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+138-138) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll (+5-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+104-88) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+258-258) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+56-56) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll (+20-20) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll (+5-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll (+236-236) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll (+120) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll (+121-148) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll (+295-233) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll (+9-9) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+58-55) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll (+12-11) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll (+12-11) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll (+17-17) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll (+13-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll (+49-49) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll (+63-63) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll (+8-7) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll (+5-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll (+11-9) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+42-40) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll (+66-48) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll (+23-23) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+369-369) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (+18-17) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (+86-86) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll (+86-86) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+242-242) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll (+30-30) 
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+168-159) 
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+122-102) 
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+65-64) 
- (modified) llvm/test/CodeGen/AMDGPU/agpr-register-count.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/always-uniform.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/amd.endpgm.ll (+17-17) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+1511-1504) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll (+24-24) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/amdpal-elf.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+19-19) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+808-702) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+701-592) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+1162-1063) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+652-554) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+722-626) 
- (modified) llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll (+24-24) 
- (modified) llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/attributor-noopt.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+280-280) 
- (modified) llvm/test/CodeGen/AMDGPU/bfe-combine.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/bfe-patterns.ll (+28-28) 
- (modified) llvm/test/CodeGen/AMDGPU/bfi_int.ll (+115-121) 
- (modified) llvm/test/CodeGen/AMDGPU/bfi_nested.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/bfm.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+89-73) 
- (modified) llvm/test/CodeGen/AMDGPU/br_cc.f16.ll (+32-32) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+46-46) 
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+21-21) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll (+14-14) 
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/call-constexpr.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (+3-1) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+135-77) 
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+394-382) 
- (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+9-9) 
- (modified) llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+13-13) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+133-101) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+455-272) 
- (modified) llvm/test/CodeGen/AMDGPU/cluster_stores.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/code-object-v3.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll (+3-23) 
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll (+113-113) 
- (modified) llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll (+74-76) 
- (modified) llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/copy_to_scc.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+144-126) 
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+122-122) 
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+44-44) 
- (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+62-62) 
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+84-84) 
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+83-83) 
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+141-110) 
- (modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+102-99) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+76-76) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-alignment.ll (+45-45) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll (+13-13) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll (+6-4) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (+49-36) 
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+117-126) 
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+75-75) 
- (modified) llvm/test/CodeGen/AMDGPU/early-inline.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/elf-notes.ll (+3-1) 
- (modified) llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+112-101) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+88-79) 
- (modified) llvm/test/CodeGen/AMDGPU/fadd.f16.ll (+78-54) 
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+224-216) 
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+274-238) 
- (modified) llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (+466-466) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+311-290) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+169-168) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (+218-219) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+155-127) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+148-142) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll (+46-46) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll (+26-26) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll (+242-182) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+371-343) 
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+1613-1613) 
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+129-129) 
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+552-552) 
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+49-49) 
- (modified) llvm/test/CodeGen/AMDGPU/fma-combine.ll (+411-377) 
- (modified) llvm/test/CodeGen/AMDGPU/fma.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+470-348) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+24-24) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+162-162) 
- (modified) llvm/test/CodeGen/AMDGPU/fmul.f16.ll (+122-122) 
- (modified) llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll (+220-136) 
- (modified) llvm/test/CodeGen/AMDGPU/fnearbyint.ll (+62-61) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+36-36) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+89-89) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll (+3-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+14-14) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+66-62) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+16-16) 
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+181-181) 
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll (+113-120) 
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll (+113-120) 
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+282-282) 
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll (+64-64) 
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll (+64-64) 
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_sint.ll (+59-59) 
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_uint.ll (+52-52) 
- (modified) llvm/test/CodeGen/AMDGPU/fpext.f16.ll (+60-68) 
- (modified) llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (+25-25) 
- (modified) llvm/test/CodeGen/AMDGPU/fptoui.f16.ll (+28-27) 
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+80-80) 
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+96-94) 
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+224-224) 
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+171-173) 
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+103-105) 
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll (+92-89) 
- (modified) llvm/test/CodeGen/AMDGPU/fsub.f16.ll (+78-78) 
- (modified) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+685-664) 
- (modified) llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/gds-allocation.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll (+17-13) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll (+354-354) 
- (modified) llvm/test/CodeGen/AMDGPU/global-constant.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+1319-1319) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+122-122) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+812-812) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+53-53) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+485-485) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+126-126) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+126-126) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+495-495) 
- (modified) llvm/test/CodeGen/AMDGPU/global_smrd.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+183-183) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll (+34-24) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll (+9-7) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/hsa.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+230-226) 
- (modified) llvm/test/CodeGen/AMDGPU/idot2.ll (+347-328) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+394-340) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+736-638) 
- (modified) llvm/test/CodeGen/AMDGPU/idot8s.ll (+367-355) 
- (modified) llvm/test/CodeGen/AMDGPU/idot8u.ll (+466-459) 
- (modified) llvm/test/CodeGen/AMDGPU/imm.ll (+266-234) 
- (modified) llvm/test/CodeGen/AMDGPU/imm16.ll (+272-258) 
- (modified) llvm/test/CodeGen/AMDGPU/immv216.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+20-20) 
- (modified) llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll (+43-39) 
- (modified) llvm/test/CodeGen/AMDGPU/infinite-loop.ll (+6-10) 
- (modified) llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/inline-attr.ll (+10-7) 
- (modified) llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+368-366) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+426-426) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+344-283) 
- (modified) llvm/test/CodeGen/AMDGPU/ipra.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/kernarg-size.ll (+5-2) 
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+468-465) 
- (modified) llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll (+50-48) 
- (modified) llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll () 
- (modified) llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll () 
- (modified) llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll () 
- (modified) llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll (+12-12) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll (+103-89) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll (+36-1) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll (+7-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll (+556-556) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll (+534-534) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+14-13) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+3-3) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0d830df1f1f1df..3373942a7782b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -781,6 +781,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
 
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
+
+  // FIXME: Why is AMDGPUAttributor not in CGSCC?
+  PB.registerOptimizerLastEPCallback(
+      [this](ModulePassManager &MPM, OptimizationLevel Level) {
+        if (Level != OptimizationLevel::O0) {
+          MPM.addPass(AMDGPUAttributorPass(*this));
+        }
+      });
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
@@ -1043,11 +1051,6 @@ void AMDGPUPassConfig::addIRPasses() {
     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
   }
 
-  // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
-  // after their introduction
-  if (TM.getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPUAttributorLegacyPass());
-
   if (TM.getOptLevel() > CodeGenOptLevel::None)
     addPass(createInferAddressSpacesPass());
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4f106bf0dfb114..b22b6609c68673 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -678,6 +678,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
         break;
       }
     }
+
+    // FIXME: We can spill incoming arguments and restore at the end of the
+    // prolog.
+    if (!ScratchWaveOffsetReg)
+      report_fatal_error(
+          "could not find temporary scratch offset register in prolog");
   } else {
     ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index a38b6e3263882c..359c1e53de99e3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-LABEL: s_add_u64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s0, s6, s0
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-LABEL: s_add_u64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
@@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-LABEL: s_sub_u64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s0, s6, s0
@@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-LABEL: s_sub_u64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index b04bc04ab22691..705bcbddf227a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
 ; CI-LABEL: lds_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: lds_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ;
 ; GFX10-LABEL: lds_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    ds_dec_rtn_u32 v0, v0, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ;
 ; GFX11-LABEL: lds_atomic_dec_ret_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    ds_dec_rtn_u32 v0, v0, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
 ; CI-LABEL: lds_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: lds_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
@@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ;
 ; GFX10-LABEL: lds_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x8
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    ds_dec_rtn_u32 v0, v1, v0 offset:16
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ;
 ; GFX11-LABEL: lds_atomic_dec_ret_i32_offset:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    ds_dec_rtn_u32 v0, v1, v0 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ; CI-LABEL: lds_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CI-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; VI-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ;
 ; GFX9-LABEL: lds_atomic_dec_noret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ;
 ; GFX10-LABEL: lds_atomic_dec_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
@@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ;
 ; GFX11-LABEL: lds_atomic_dec_noret_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0
 ; GFX11-NEXT:    ds_dec_u32 v0, v1
@@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 {
 ; CI-LABEL: lds_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CI-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
+; VI-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -254,7 +254,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 ;
 ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
@@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 ;
 ; GFX10-LABEL: lds_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
@@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 ;
 ; GFX11-LABEL: lds_atomic_dec_noret_i32_offset:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    ds_dec_u32 v1, v0 offset:16
@@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: global_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ;
 ; GFX10-LABEL: global_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ;
 ; GFX11-LABEL: global_atomic_dec_ret_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, v0, s[2:3] glc
@@ -364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: global_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ;
 ; GFX10-LABEL: global_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ;
 ; GFX11-LABEL: global_atomic_dec_ret_i32_offset:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
@@ -443,7 +443,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ;
 ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ;
 ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
@@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/83131


More information about the llvm-commits mailing list