[llvm] [AMDGPU] Implement LSR cost model for GFX9+ (PR #184138)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 06:48:00 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (michaelselehov)
<details>
<summary>Changes</summary>
AMDGPU previously had no target-specific LSR cost model, so the generic heuristic would often introduce extra induction variables and base-add chains that hurt VALU throughput on GFX9+ (observed on gfx942).
Implement a custom cost model:
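- isLSRCostLess(): on GFX9+, compare solutions lexicographically by effective per-iteration instruction count (Insns + ScaleCost), then IV multiplies, AddRec cost, base adds, setup cost, immediate cost, and finally register count, so that per-iteration work outranks setup cost and register count becomes the last tie-breaker. Pre-GFX9 falls back to the default comparator.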
- getScalingFactorCost(): report that base+scale*index addressing requires an extra ADD instruction.
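- isNumRegsMajorCostOfLSR(): return false. This hook is not gated on the subtarget generation, so it also applies to pre-GFX9 targets.
- shouldDropLSRSolutionIfLessProfitable(): return true. This hook is likewise unconditional, so pre-GFX9 codegen can change as well (see the GFX7/GFX8 updates in copy-to-reg.ll).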
Assisted-by: Claude Opus
---
Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/184138.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+42)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+10)
- (modified) llvm/test/CodeGen/AMDGPU/copy-to-reg.ll (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll (+39-46)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+70-88)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+148-174)
- (added) llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll (+37)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+91-101)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+6673-6640)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-pattern.ll (+18-19)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll (+6-8)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll (+3-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d746ce65a6288..d50dd5bbb0e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -16,6 +16,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
@@ -1703,3 +1704,44 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
+
+InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+ StackOffset BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ unsigned AddrSpace) const {
+ // AMDGPU has limited addressing modes. base+scale*index requires an extra
+ // ADD instruction, unlike architectures with rich addressing modes.
+ if (HasBaseReg && Scale != 0)
+ return 1;
+ return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
+ AddrSpace);
+}
+
+bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
+ const TTI::LSRCost &B) const {
+ // GFX9+: favor lower per-iteration work over preheader/setup costs.
+ // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
+ // effective instruction count (base+scale*index requires a separate ADD).
+ if (getST()->getGeneration() >= AMDGPUSubtarget::GFX9) {
+ unsigned EffInsnsA = A.Insns + A.ScaleCost;
+ unsigned EffInsnsB = B.Insns + B.ScaleCost;
+
+ return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
+ A.SetupCost, A.ImmCost, A.NumRegs) <
+ std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
+ B.SetupCost, B.ImmCost, B.NumRegs);
+ }
+
+ // Pre-GFX9: keep the default behavior.
+ return BaseT::isLSRCostLess(A, B);
+}
+
+bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
+ // isLSRCostLess de-prioritizes register count; keep consistent.
+ return false;
+}
+
+bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
+ // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3ec157aacd0aa..24d4ec8d85d45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -311,6 +311,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
unsigned getNumberOfParts(Type *Tp) const override;
InstructionUniformity getInstructionUniformity(const Value *V) const override;
+
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+ StackOffset BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ unsigned AddrSpace) const override;
+
+ bool isLSRCostLess(const TTI::LSRCost &A,
+ const TTI::LSRCost &B) const override;
+ bool isNumRegsMajorCostOfLSR() const override;
+ bool shouldDropLSRSolutionIfLessProfitable() const override;
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
index 931a14473c340..f5223d5553c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_mov_b32 s0, 0
-; GFX7-NEXT: s_mov_b32 s1, 0
; GFX7-NEXT: .LBB0_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mov_b32_e32 v0, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_add_i32 s1, s1, 1
-; GFX7-NEXT: s_add_i32 s0, s0, 4
-; GFX7-NEXT: s_cmp_lt_u32 s1, 16
+; GFX7-NEXT: s_lshl_b32 s1, s0, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_add_i32 s0, s0, 1
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_cmp_lt_u32 s0, 16
; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GFX7-NEXT: s_cbranch_scc1 .LBB0_1
; GFX7-NEXT: ; %bb.2: ; %done
@@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX8-NEXT: s_add_u32 s88, s88, s11
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_mov_b32 s0, 0
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB0_1: ; %loop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: s_add_i32 s1, s1, 1
-; GFX8-NEXT: s_add_i32 s0, s0, 4
-; GFX8-NEXT: s_cmp_lt_u32 s1, 16
+; GFX8-NEXT: s_lshl_b32 s1, s0, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_i32 s0, s0, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_cmp_lt_u32 s0, 16
; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GFX8-NEXT: s_cbranch_scc1 .LBB0_1
; GFX8-NEXT: ; %bb.2: ; %done
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 8b7c49b5931af..2b093539b9e6d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2741,16 +2741,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@@ -2758,19 +2757,19 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
-; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@@ -2779,16 +2778,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB116_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm
@@ -2815,19 +2813,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
-; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@@ -2835,22 +2831,21 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
-; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@@ -2859,19 +2854,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB117_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
-; GFX1250-NOECC-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 49de34820c4c0..e792f2e7e6d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4714,17 +4714,16 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: s_movk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB132_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_add_u32 s4, s2, s0
-; GFX9-NEXT: s_addc_u32 s5, s3, s1
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX9-NEXT: s_add_i32 s0, s0, -1
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB132_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -4732,17 +4731,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-NEXT: s_movk_i32 s0, 0x100
; GFX10-NEXT: .LBB132_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
-; GFX10-NEXT: s_add_u32 s4, s2, s0
-; GFX10-NEXT: s_addc_u32 s5, s3, s1
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX10-NEXT: s_add_i32 s0, s0, -1
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB132_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -4750,17 +4748,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX11-LABEL: global_addr_64bit_lsr_iv:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_movk_i32 s0, 0x100
; GFX11-NEXT: .LBB132_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s4, s2, s0
-; GFX11-NEXT: s_addc_u32 s5, s3, s1
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s0, -1
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB132_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@@ -4768,38 +4764,34 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
; GFX12-GISEL: ; %bb.0: ; %bb
-; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
-; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
-; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4
+; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX12-GISEL-NEXT: s_endpgm
@@ -4824,20 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/184138
More information about the llvm-commits mailing list