[llvm] [AMDGPU] Generate s_lshl?_add_u32 (PR #167032)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 09:44:23 PST 2025
https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/167032
>From a923825e39b1ca93abb986369180836ce7d90a15 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 6 Nov 2025 11:55:11 -0600
Subject: [PATCH 1/5] Generate shlN_add with sdag
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 ++++-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 ++
.../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 25 ++--
.../CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 8 +-
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 69 ++++------
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 12 +-
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 24 ++--
llvm/test/CodeGen/AMDGPU/lds-relocs.ll | 6 +-
.../lower-work-group-id-intrinsics-opt.ll | 5 +-
.../AMDGPU/lower-work-group-id-intrinsics.ll | 13 +-
.../materialize-frame-index-sgpr.gfx10.ll | 46 +++----
.../CodeGen/AMDGPU/mubuf-offset-private.ll | 2 +-
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 12 +-
llvm/test/CodeGen/AMDGPU/shlN_add.ll | 84 ++++--------
.../AMDGPU/splitkit-getsubrangeformask.ll | 125 +++++++++---------
15 files changed, 217 insertions(+), 258 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c74c654d8e35..eb94b4b22e85f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7635,6 +7635,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7872,7 +7874,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7897,7 +7898,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Inst.eraseFromParent();
   }
   return;
+  case AMDGPU::S_LSHL1_ADD_U32:
+  case AMDGPU::S_LSHL2_ADD_U32:
+  case AMDGPU::S_LSHL3_ADD_U32:
+  case AMDGPU::S_LSHL4_ADD_U32: {
+    MachineOperand &Dest = Inst.getOperand(0);
+    MachineOperand &Src0 = Inst.getOperand(1);
+    MachineOperand &Src1 = Inst.getOperand(2);
+
+    // Recover the shift amount N encoded in the S_LSHLN_ADD_U32 opcode.
+    unsigned ShiftAmt = Opcode == AMDGPU::S_LSHL1_ADD_U32   ? 1
+                        : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+                        : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+                                                            : 4;
+
+    const TargetRegisterClass *NewRC =
+        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+    Register DestReg = MRI.createVirtualRegister(NewRC);
+    MachineInstr *NewInstr =
+        BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+            .add(Src0)
+            .addImm(ShiftAmt)
+            .add(Src1);
+
+    legalizeOperands(*NewInstr, MDT);
+    MRI.replaceRegWith(Dest.getReg(), DestReg);
+    addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+                                 Worklist);
+    Inst.eraseFromParent();
+  }
+    return;
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
@@ -7994,7 +8024,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -8024,7 +8053,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8042,7 +8070,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8066,7 +8093,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..93cfd5ab3750c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2012,6 +2012,17 @@ let AddedComplexity = 20 in {
>;
}
+// On GFX9+ fold a uniform add of a one-use left shift by 1..4,
+// (add (shl $src0, N), $src1), into the scalar S_LSHLN_ADD_U32 instruction.
+// The one-use restriction keeps a shared shift result from being recomputed
+// inside every add that consumes it.
+let SubtargetPredicate = isGFX9Plus in
+foreach I = 1-4 in {
+def : GCNPat <
+  (i32 (UniformBinFrag<add> (shl_oneuse i32:$src0, (i32 I)), i32:$src1)),
+  (!cast<SOP2_Pseudo>("S_LSHL"#I#"_ADD_U32") $src0, $src1)
+>;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index 06150e4277e9a..7669ae21f6635 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -51,10 +51,8 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
@@ -69,8 +67,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX942-LABEL: test_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_mov_b32 s32, 16
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: v_mov_b32_e32 v0, 0
@@ -211,15 +208,13 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: scratch_store_b32 off, v0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -232,8 +227,7 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_alloca_and_call_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
@@ -396,14 +390,12 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: v_mov_b32_e32 v40, 0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -417,8 +409,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_call_and_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 9a4040a25419a..49977a4c64784 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s2, s2, s3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
@@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index c5db7a33f70e0..9f2001d452fe3 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
@@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
@@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
@@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
@@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2
; GFX11-SDAG-NEXT: s_endpgm
; GFX11-SDAG-NEXT: .LBB7_6:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index b0e6752386285..e01cb79382c05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 3eef616ba267d..ad894ce36c55b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 4
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -163,8 +161,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index 3c55dcb486675..91489d76b18f6 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -6,8 +6,8 @@
; ELF: Relocations [
; ELF-NEXT: Section (3) .rel.text {
-; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external
-; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined
+; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external
+; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined
; ELF-NEXT: }
; ELF-NEXT: ]
@@ -35,7 +35,7 @@
; GCN: v_mov_b32_e32 v1, lds.external at abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
; GCN-NEXT: ; fixup A - offset: 4, value: lds.external at abs32@lo, kind: FK_Data_4{{$}}
;
-; GCN: s_add_i32 s0, s0, lds.defined at abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A]
+; GCN: s_lshl2_add_u32 s0, s2, lds.defined at abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A]
; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined at abs32@lo, kind: FK_Data_4{{$}}
;
; GCN: .globl lds.external
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
index 69439d49e588f..de82dcdecda48 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
@@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %ou
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1
-; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
index 497241cff392d..6b6658bd672de 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
@@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" {
;
; GFX1250-SDAG-LABEL: workgroup_id_optimized:
; GFX1250-SDAG: ; %bb.0: ; %.entry
-; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1
-; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15
-; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14
-; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0
-; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc
+; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14
; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15
+; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc
; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008
; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3
; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004
-; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0
+; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1
; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
; GFX1250-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 4b5a7c207055a..8dea9e87e140f 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -1620,15 +1620,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_1-NEXT: s_add_i32 s55, s55, s4
+; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_1-NEXT: s_addk_i32 s4, 0x4040
+; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_1-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s55, scc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1650,15 +1649,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_3-NEXT: s_add_i32 s55, s55, s4
+; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_3-NEXT: s_addk_i32 s4, 0x4040
+; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_3-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s55, scc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1677,15 +1675,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX11-NEXT: s_add_i32 s2, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s1, s32, 64
; GFX11-NEXT: v_writelane_b32 v1, s55, 0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_add_i32 s1, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-NEXT: s_add_i32 s55, s32, s0
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_addk_i32 s55, 0x4040
+; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s55, scc
@@ -1710,16 +1708,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: v_writelane_b32 v1, s55, 0
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s55, s32, s0
+; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_addk_co_i32 s55, 0x4000
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s55, scc
; GFX12-NEXT: ;;#ASMEND
@@ -1767,11 +1763,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_lshr_b32 s4, s32, 6
+; GFX900-NEXT: s_addk_i32 s4, 0x4040
; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: s_lshl_b32 s4, s16, 2
-; GFX900-NEXT: s_lshr_b32 s55, s32, 6
-; GFX900-NEXT: s_add_i32 s55, s55, s4
-; GFX900-NEXT: s_addk_i32 s55, 0x4040
+; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -1796,10 +1791,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX942-NEXT: s_add_i32 s1, s32, 0x8040
; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4040
; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_add_i32 s55, s32, s0
-; GFX942-NEXT: s_addk_i32 s55, 0x4040
+; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
index 6e2d0f6503a20..7e2bfa666a19f 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -144,7 +144,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GFX9: global_load_dword [[VADDR:v[0-9]+]],
-; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]]
+; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}}
; GFX9-NOT [[ADDR]]
; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 627f4ada95dba..c1f52173c7451 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
-; MUBUF-NEXT: s_add_i32 s6, s6, s7
+; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
-; FLATSCR-NEXT: s_add_i32 s2, s2, s3
+; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff
; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: v_mov_b32_e32 v3, 1
-; MUBUF-NEXT: s_add_i32 s4, s4, s5
+; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
@@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
-; FLATSCR-NEXT: s_add_i32 s0, s0, s1
+; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
index 3e507a0c5889f..ba8ae9554d0e8 100644
--- a/llvm/test/CodeGen/AMDGPU/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
@@ -14,8 +14,7 @@
define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl1_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl1_add_u32:
@@ -26,8 +25,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl1_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl1_add_u32:
@@ -53,8 +51,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl2_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl2_add_u32:
@@ -65,8 +62,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl2_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl2_add_u32:
@@ -92,8 +88,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl3_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl3_add_u32:
@@ -104,8 +99,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl3_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl3_add_u32:
@@ -131,8 +125,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl4_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl4_add_u32:
@@ -143,8 +136,7 @@ define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl4_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl4_add_u32:
@@ -598,10 +590,8 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl1_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 1
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl1_add_u32_v2:
@@ -614,10 +604,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl1_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 1
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl1_add_u32_v2:
@@ -647,10 +635,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl2_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 2
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl2_add_u32_v2:
@@ -663,10 +649,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl2_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl2_add_u32_v2:
@@ -696,10 +680,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl3_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 3
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl3_add_u32_v2:
@@ -712,10 +694,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl3_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl3_add_u32_v2:
@@ -745,10 +725,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl4_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl4_add_u32_v2:
@@ -761,10 +739,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl4_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl4_add_u32_v2:
@@ -794,10 +770,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2:
@@ -810,10 +784,8 @@ define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32
;
; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl_2_4_add_u32_v2:
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..90304b2c730cb 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -69,6 +69,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -92,7 +93,6 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
@@ -101,6 +101,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
@@ -113,10 +114,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
@@ -127,25 +126,25 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
@@ -164,11 +163,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -185,11 +184,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
@@ -198,32 +197,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %470:sreg_64
+ ; CHECK-NEXT: KILL undef %469:sreg_64
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
- ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
- ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
@@ -236,10 +235,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
- ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -310,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec
@@ -326,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec
@@ -351,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
- ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
- ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
>From de2f09cb46b3852c082e94d32fe83d4113097f13 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 7 Nov 2025 15:48:11 -0600
Subject: [PATCH 2/5] Generate s_lshl?_add_u32
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb94b4b22e85f..9701079bb2761 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7905,21 +7905,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
- dbgs() << "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n";
- Inst.dump();
- unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 :
- Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 :
- Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3 : 4);
-
const TargetRegisterClass *NewRC =
RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
Register DestReg = MRI.createVirtualRegister(NewRC);
- MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
- .add(Src0)
- .addImm(ShiftAmt)
- .add(Src1);
-
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
legalizeOperands(*NewInstr, MDT);
MRI.replaceRegWith(Dest.getReg(), DestReg);
>From 7610a5db285f9901c066c4d5890a46af9e4a72ed Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 7 Nov 2025 16:42:51 -0600
Subject: [PATCH 3/5] Test moving to VALU
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../CodeGen/AMDGPU/move-to-valu-lshl_add.ll | 90 +++++++++++++++++++
1 file changed, 90 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
new file mode 100644
index 0000000000000..b2d9fe667d958
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
+
+define amdgpu_kernel void @lshl1_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl1_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 1, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_SHORT killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i16, ptr addrspace(5) %alloca, i64 %1
+ store i16 0, ptr addrspace(5) %gep, align 2
+ ret void
+}
+
+define amdgpu_kernel void @lshl2_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl2_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 2, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i32, ptr addrspace(5) %alloca, i64 %1
+ store i32 0, ptr addrspace(5) %gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lshl3_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl3_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 3, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORDX2 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i64, ptr addrspace(5) %alloca, i64 %1
+ store i64 0, ptr addrspace(5) %gep, align 8
+ ret void
+}
+
+define amdgpu_kernel void @lshl4_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl4_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 4, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK-NEXT: SCRATCH_STORE_DWORDX4 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i128, ptr addrspace(5) %alloca, i64 %1
+ store i128 0, ptr addrspace(5) %gep, align 16
+ ret void
+}
>From 54c53e1447d8eec426ee01dca8f6a8f35102a7ac Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 10 Nov 2025 09:01:26 -0600
Subject: [PATCH 4/5] Avoid %alloca and load from null
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../CodeGen/AMDGPU/move-to-valu-lshl_add.ll | 160 +++++++++++++-----
1 file changed, 116 insertions(+), 44 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
index b2d9fe667d958..63eb18f195671 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -1,90 +1,162 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
-define amdgpu_kernel void @lshl1_add(ptr addrspace(5) %alloca) {
+define amdgpu_kernel void @lshl1_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
; CHECK-LABEL: name: lshl1_add
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
- ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 1, killed [[S_LOAD_DWORD_IMM]], implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: SCRATCH_STORE_SHORT killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.in.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s128) from %ir.in2.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s32) from %ir.in2.kernarg.offset + 16, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[S_LOAD_DWORD_IMM]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.in.load, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], 1, killed [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHL_ADD_U32_e64_]]
+ ; CHECK-NEXT: BUFFER_STORE_SHORT_VBUFFER_OFFEN_exact [[COPY9]], killed [[COPY10]], killed [[REG_SEQUENCE4]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.in2.load.ptr.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
- %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
%1 = sext i32 %vaddr to i64
- %gep = getelementptr i16, ptr addrspace(5) %alloca, i64 %1
- store i16 0, ptr addrspace(5) %gep, align 2
+ %gep = getelementptr i16, ptr addrspace(7) %in2, i64 %1
+ store i16 0, ptr addrspace(7) %gep, align 2
ret void
}
-define amdgpu_kernel void @lshl2_add(ptr addrspace(5) %alloca) {
+define amdgpu_kernel void @lshl2_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
; CHECK-LABEL: name: lshl2_add
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
- ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 2, killed [[S_LOAD_DWORD_IMM]], implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: SCRATCH_STORE_DWORD killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.in.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s128) from %ir.in2.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s32) from %ir.in2.kernarg.offset + 16, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[S_LOAD_DWORD_IMM]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.in.load, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], 2, killed [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHL_ADD_U32_e64_]]
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY9]], killed [[COPY10]], killed [[REG_SEQUENCE4]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.in2.load.ptr.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
- %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
%1 = sext i32 %vaddr to i64
- %gep = getelementptr i32, ptr addrspace(5) %alloca, i64 %1
- store i32 0, ptr addrspace(5) %gep, align 4
+ %gep = getelementptr i32, ptr addrspace(7) %in2, i64 %1
+ store i32 0, ptr addrspace(7) %gep, align 4
ret void
}
-define amdgpu_kernel void @lshl3_add(ptr addrspace(5) %alloca) {
+define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
; CHECK-LABEL: name: lshl3_add
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
- ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 3, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.in.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s128) from %ir.in2.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s32) from %ir.in2.kernarg.offset + 16, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; CHECK-NEXT: SCRATCH_STORE_DWORDX2 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[S_LOAD_DWORD_IMM]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.in.load, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], 3, killed [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHL_ADD_U32_e64_]]
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact killed [[V_MOV_B]], killed [[COPY9]], killed [[REG_SEQUENCE4]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.in2.load.ptr.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
- %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
%1 = sext i32 %vaddr to i64
- %gep = getelementptr i64, ptr addrspace(5) %alloca, i64 %1
- store i64 0, ptr addrspace(5) %gep, align 8
+ %gep = getelementptr i64, ptr addrspace(7) %in2, i64 %1
+ store i64 0, ptr addrspace(7) %gep, align 8
ret void
}
-define amdgpu_kernel void @lshl4_add(ptr addrspace(5) %alloca) {
+define amdgpu_kernel void @lshl4_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
; CHECK-LABEL: name: lshl4_add
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
- ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 4, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.in.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s128) from %ir.in2.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s32) from %ir.in2.kernarg.offset + 16, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; CHECK-NEXT: SCRATCH_STORE_DWORDX4 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[S_LOAD_DWORD_IMM]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY6]], %subreg.sub2, killed [[COPY5]], %subreg.sub3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.in.load, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], 4, killed [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_LSHL_ADD_U32_e64_]]
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact killed [[REG_SEQUENCE5]], killed [[COPY13]], killed [[REG_SEQUENCE4]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.in2.load.ptr.rsrc, align 1, addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
- %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
%1 = sext i32 %vaddr to i64
- %gep = getelementptr i128, ptr addrspace(5) %alloca, i64 %1
- store i128 0, ptr addrspace(5) %gep, align 16
+ %gep = getelementptr i128, ptr addrspace(7) %in2, i64 %1
+ store i128 0, ptr addrspace(7) %gep, align 16
ret void
}
>From b9d1c06a81f6f9c733209fc1495d60d39577741b Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 10 Nov 2025 11:44:06 -0600
Subject: [PATCH 5/5] Remove new pattern. Enable existing pattern for
SelectionDAG.
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 -
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 225 +++++++++--------
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 102 ++++----
.../CodeGen/AMDGPU/gep-flags-stack-offsets.ll | 24 +-
.../CodeGen/AMDGPU/hip.extern.shared.array.ll | 12 +-
llvm/test/CodeGen/AMDGPU/lds-relocs.ll | 2 +-
.../memory-legalizer-local-nontemporal.ll | 238 ++++++++----------
.../AMDGPU/memory-legalizer-local-volatile.ll | 142 +++++------
.../memory-legalizer-private-nontemporal.ll | 168 ++++++-------
.../memory-legalizer-private-volatile.ll | 72 +++---
llvm/test/CodeGen/AMDGPU/wqm.ll | 4 +-
12 files changed, 453 insertions(+), 550 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b7f63eceb5d5c..0bde5d3fd2f26 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -776,11 +776,7 @@ def xnor : PatFrag <
foreach I = 1-4 in {
def shl#I#_add : PatFrag <
(ops node:$src0, node:$src1),
- (add (shl_oneuse $src0, (i32 I)), $src1)> {
- // FIXME: Poor substitute for disabling pattern in SelectionDAG
- let PredicateCode = [{return false;}];
- let GISelPredicateCode = [{return true;}];
-}
+ (add (shl_oneuse $src0, (i32 I)), $src1)>;
}
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 93cfd5ab3750c..1931e0be15152 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2012,14 +2012,6 @@ let AddedComplexity = 20 in {
>;
}
-let SubtargetPredicate = isGFX9Plus in
-foreach I = 1-4 in {
-def : GCNPat <
- (i32 (UniformBinFrag<add> (shl_oneuse i32:$src0, (i32 I)), i32:$src1)),
- (!cast<SOP2_Pseudo>("S_LSHL"#I#"_ADD_U32") $src0, $src1)
->;
-}
-
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9f2001d452fe3..d19a260db3550 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -361,26 +361,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s7, 0
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
-; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0
; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -1156,35 +1156,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s33
; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s7, 0
; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000
-; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0
; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT: s_mov_b32 s34, s10
-; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s11
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s10
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
@@ -1225,34 +1225,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s34
-; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
-; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
-; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10
-; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT: s_mov_b32 s34, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
@@ -1835,20 +1836,20 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
-; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
-; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11
+; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
@@ -1879,7 +1880,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
@@ -1897,7 +1897,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
; GFX9-SDAG-NEXT: s_mov_b32 s34, s14
@@ -2001,27 +2002,26 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
-; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
-; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5
+; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5
+; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX11-SDAG-NEXT: ; %bb.3:
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
-; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
-; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
@@ -2044,31 +2044,30 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
; GFX11-SDAG-NEXT: ; %bb.8:
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
; GFX11-SDAG-NEXT: s_mov_b32 s34, s8
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2174,9 +2173,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s33
+; GFX9-SDAG-NEXT: s_mov_b32 s12, s33
; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
-; GFX9-SDAG-NEXT: s_mov_b32 s12, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s13, s34
; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
@@ -2186,24 +2185,24 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
-; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
-; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10
+; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2
-; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2
+; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31
; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow
@@ -2233,8 +2232,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
-; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
-; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s13
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s12
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
@@ -2306,9 +2305,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s33
; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
-; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s34
; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
@@ -2318,28 +2317,28 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3
-; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
+; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX11-SDAG-NEXT: ; %bb.3:
-; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2
; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow
; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8
@@ -2368,8 +2367,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
-; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s6
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..870b679a84d11 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -857,13 +857,13 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s32
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -915,13 +915,13 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s32
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -929,8 +929,8 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX942-LABEL: store_load_vindex_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s32
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s32
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
; GFX942-NEXT: v_mov_b32_e32 v2, 15
; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -2146,16 +2146,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x100
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2214,16 +2214,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x100
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -2231,11 +2231,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX942-LABEL: store_load_vindex_small_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x100
; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x100
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
; GFX942-NEXT: v_mov_b32_e32 v2, 15
; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3447,16 +3447,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x4004
; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3516,16 +3516,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -3533,11 +3533,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX942-LABEL: store_load_vindex_large_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4004
; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
; GFX942-NEXT: v_mov_b32_e32 v2, 15
; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3940,12 +3940,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4001,15 +4001,15 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[12:13]
; GFX9-PAL-NEXT: s_mov_b32 s12, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, 0
; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4020,11 +4020,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 15
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 15
+; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
index b5f0b2ff9ef4c..61902b5fd4661 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll
@@ -18,8 +18,8 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_noflags_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -45,8 +45,8 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_inbounds_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -72,8 +72,8 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_nuw_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -99,8 +99,8 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_nusw_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -126,8 +126,8 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_inbounds_nuw_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -153,8 +153,8 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 {
; GFX9-LABEL: gep_nusw_nuw_alloca:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX9-NEXT: s_lshr_b32 s4, s32, 6
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
index 8bd6c0f2652cf..d24b3a23cb9cd 100644
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -22,7 +22,7 @@ define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) {
}
; CHECK-LABEL: {{^}}dynamic_shared_array_1:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0xc00
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) {
entry:
@@ -49,7 +49,7 @@ endif: ; preds = %else, %if
}
; CHECK-LABEL: {{^}}dynamic_shared_array_2:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x4000
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -64,7 +64,7 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
; The offset to the dynamic shared memory array should be aligned on the type
; specified.
; CHECK-LABEL: {{^}}dynamic_shared_array_3:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44
; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -80,7 +80,7 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
; The offset to the dynamic shared memory array should be aligned on the
; maximal one.
; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x48
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
@@ -99,7 +99,7 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
; Honor the explicit alignment from the specified variable.
; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
@@ -118,7 +118,7 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
; Honor the explicit alignment from the specified variable.
; CHECK-LABEL: {{^}}dynamic_shared_array_6:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
+; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x50
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index 91489d76b18f6..447cb62643384 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -32,7 +32,7 @@
; ELF-NEXT: }
; GCN-LABEL: {{^}}test_basic:
-; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
+; GCN: s_mov_b32 s0, lds.external@abs32@lo ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A]
; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
;
; GCN: s_lshl2_add_u32 s0, s2, lds.defined@abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index c6f7ce51f5ea2..9888204b997a9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -260,12 +260,11 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX10-WGP-LABEL: local_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
@@ -274,12 +273,11 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX10-CU-LABEL: local_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-CU-NEXT: ds_read_b32 v1, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -311,15 +309,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
-; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
+; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
@@ -328,15 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
-; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
+; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
@@ -345,15 +339,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
-; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
+; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
+; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
@@ -362,15 +354,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
-; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
+; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
+; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
@@ -379,14 +369,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-WGP-LABEL: local_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-WGP-NEXT: s_mov_b32 s2, 2
+; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -395,14 +384,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-CU-LABEL: local_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-CU-NEXT: s_mov_b32 s2, 2
+; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-CU-NEXT: ds_load_b32 v1, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -411,15 +399,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-WGP-LABEL: local_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_alu 0xfffe
-; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -428,15 +414,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-LABEL: local_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xfffe
-; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX12-CU-NEXT: ds_load_b32 v1, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -445,14 +429,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX1250-LABEL: local_nontemporal_load_1:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX1250-NEXT: s_mov_b32 s2, 2
+; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX1250-NEXT: ds_load_b32 v1, v1
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -679,12 +662,11 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX10-WGP-LABEL: local_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-WGP-NEXT: s_mov_b32 s5, 2
-; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6
+; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
@@ -692,12 +674,11 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX10-CU-LABEL: local_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
; GFX10-CU-NEXT: ds_write_b32 v0, v1
@@ -720,15 +701,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
-; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
@@ -736,15 +715,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
-; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
+; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
+; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
@@ -752,15 +729,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
-; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2
-; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
+; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
@@ -768,15 +743,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
-; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
-; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
+; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1
@@ -784,14 +757,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX11-WGP-LABEL: local_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
@@ -799,14 +771,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX11-CU-LABEL: local_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
@@ -814,15 +785,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX12-WGP-LABEL: local_nontemporal_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX12-WGP-NEXT: s_mov_b32 s1, 2
-; GFX12-WGP-NEXT: s_wait_alu 0xfffe
-; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
@@ -830,15 +799,13 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX12-CU-LABEL: local_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX12-CU-NEXT: s_mov_b32 s1, 2
-; GFX12-CU-NEXT: s_wait_alu 0xfffe
-; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
@@ -846,15 +813,14 @@ define amdgpu_kernel void @local_nontemporal_store_1(
;
; GFX1250-LABEL: local_nontemporal_store_1:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s1, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX1250-NEXT: s_mov_b32 s1, 2
-; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s0
; GFX1250-NEXT: ds_store_b32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index d686e7a2d5b4c..33c516c61e42c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -208,12 +208,11 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX10-WGP-LABEL: local_volatile_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
@@ -222,12 +221,11 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX10-CU-LABEL: local_volatile_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-CU-NEXT: ds_read_b32 v1, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -259,14 +257,13 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-WGP-LABEL: local_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-WGP-NEXT: s_mov_b32 s2, 2
+; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -275,14 +272,13 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-CU-LABEL: local_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-CU-NEXT: s_mov_b32 s2, 2
+; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-CU-NEXT: ds_load_b32 v1, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -291,15 +287,13 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-WGP-LABEL: local_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_alu 0xfffe
-; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -308,15 +302,13 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-LABEL: local_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_alu 0xfffe
-; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX12-CU-NEXT: ds_load_b32 v1, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -325,14 +317,13 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX1250-LABEL: local_volatile_load_1:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX1250-NEXT: s_mov_b32 s2, 2
+; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX1250-NEXT: ds_load_b32 v1, v1
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -511,12 +502,11 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX10-WGP-LABEL: local_volatile_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-WGP-NEXT: s_mov_b32 s5, 2
-; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6
+; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
@@ -524,12 +514,11 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX10-CU-LABEL: local_volatile_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
; GFX10-CU-NEXT: ds_write_b32 v0, v1
@@ -552,14 +541,13 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX11-WGP-LABEL: local_volatile_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: ds_store_b32 v0, v1
@@ -567,14 +555,13 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX11-CU-LABEL: local_volatile_store_1:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
@@ -582,15 +569,13 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX12-WGP-LABEL: local_volatile_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX12-WGP-NEXT: s_mov_b32 s1, 2
-; GFX12-WGP-NEXT: s_wait_alu 0xfffe
-; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
@@ -598,15 +583,13 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX12-CU-LABEL: local_volatile_store_1:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX12-CU-NEXT: s_mov_b32 s1, 2
-; GFX12-CU-NEXT: s_wait_alu 0xfffe
-; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
@@ -614,15 +597,14 @@ define amdgpu_kernel void @local_volatile_store_1(
;
; GFX1250-LABEL: local_volatile_store_1:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s1, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX1250-NEXT: s_mov_b32 s1, 2
-; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, s0
; GFX1250-NEXT: ds_store_b32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 89de17ecbd1e8..6c19722ad6e33 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -270,12 +270,11 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
@@ -286,12 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -330,15 +328,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
-; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
+; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
@@ -349,15 +345,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
-; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
+; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
-; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
@@ -366,15 +360,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
-; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
+; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
+; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
@@ -383,15 +375,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
-; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
+; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
-; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2
+; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
@@ -400,14 +390,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-WGP-LABEL: private_nontemporal_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-WGP-NEXT: s_mov_b32 s2, 2
+; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -416,14 +405,13 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-CU-LABEL: private_nontemporal_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-CU-NEXT: s_mov_b32 s2, 2
+; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -708,12 +696,11 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-WGP-NEXT: s_mov_b32 s5, 2
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
@@ -723,12 +710,11 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
@@ -758,15 +744,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
-; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
-; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2
-; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
@@ -776,15 +760,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff
-; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5
-; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff
+; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6
+; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
@@ -792,15 +774,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
-; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2
-; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
+; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off nt
@@ -808,15 +788,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
-; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff
-; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
-; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff
+; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX942-TGSPLIT-NEXT: scratch_store_dword v1, v0, off nt
@@ -824,14 +802,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX11-WGP-LABEL: private_nontemporal_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc
@@ -839,14 +816,13 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 7faa0621aa6d0..7c23b76cec3e9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -228,12 +228,11 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
@@ -244,12 +243,11 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
@@ -286,14 +284,13 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-WGP-LABEL: private_volatile_load_1:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-WGP-NEXT: s_mov_b32 s2, 2
+; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -302,14 +299,13 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-CU-LABEL: private_volatile_load_1:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2
-; GFX11-CU-NEXT: s_mov_b32 s2, 2
+; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -578,12 +574,11 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-WGP-NEXT: s_mov_b32 s5, 2
-; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
@@ -594,12 +589,11 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
@@ -629,14 +623,13 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX11-WGP-LABEL: private_volatile_store_1:
; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc
@@ -645,14 +638,13 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX11-CU-LABEL: private_volatile_store_1:
; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 21f0c008366a9..0fdc1a83dddbd 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2029,10 +2029,10 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: s_mov_b32 s2, 0
; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
+; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, s2
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list