[llvm] [AMDGPU] Generate s_lshl?_add_u32 (PR #167032)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 14:45:47 PST 2025
https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/167032
Generate s_lshl?_add_u32 through SDAG.
>From a923825e39b1ca93abb986369180836ce7d90a15 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 6 Nov 2025 11:55:11 -0600
Subject: [PATCH 1/3] Generate shlN_add with sdag
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 ++++-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 ++
.../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 25 ++--
.../CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 8 +-
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 69 ++++------
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 12 +-
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 24 ++--
llvm/test/CodeGen/AMDGPU/lds-relocs.ll | 6 +-
.../lower-work-group-id-intrinsics-opt.ll | 5 +-
.../AMDGPU/lower-work-group-id-intrinsics.ll | 13 +-
.../materialize-frame-index-sgpr.gfx10.ll | 46 +++----
.../CodeGen/AMDGPU/mubuf-offset-private.ll | 2 +-
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 12 +-
llvm/test/CodeGen/AMDGPU/shlN_add.ll | 84 ++++--------
.../AMDGPU/splitkit-getsubrangeformask.ll | 125 +++++++++---------
15 files changed, 217 insertions(+), 258 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c74c654d8e35..eb94b4b22e85f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7635,6 +7635,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
// Handle some special cases
switch (Opcode) {
default:
@@ -7872,7 +7874,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest0 = Inst.getOperand(0);
MachineOperand &Dest1 = Inst.getOperand(1);
MachineOperand &Src0 = Inst.getOperand(2);
@@ -7897,7 +7898,35 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
}
return;
+  case AMDGPU::S_LSHL1_ADD_U32:
+  case AMDGPU::S_LSHL2_ADD_U32:
+  case AMDGPU::S_LSHL3_ADD_U32:
+  case AMDGPU::S_LSHL4_ADD_U32: {
+    MachineOperand &Dest = Inst.getOperand(0);
+    MachineOperand &Src0 = Inst.getOperand(1);
+    MachineOperand &Src1 = Inst.getOperand(2);
+
+    unsigned ShiftAmt = Opcode == AMDGPU::S_LSHL1_ADD_U32   ? 1
+                        : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+                        : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+                                                            : 4;
+
+    const TargetRegisterClass *NewRC =
+        RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
+    Register DestReg = MRI.createVirtualRegister(NewRC);
+    MachineInstr *NewInstr =
+        BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+            .add(Src0)
+            .addImm(ShiftAmt)
+            .add(Src1);
+
+    legalizeOperands(*NewInstr, MDT);
+    MRI.replaceRegWith(Dest.getReg(), DestReg);
+    addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+    Inst.eraseFromParent();
+  }
+    return;
case AMDGPU::S_CSELECT_B32:
case AMDGPU::S_CSELECT_B64:
lowerSelect(Worklist, Inst, MDT);
@@ -7994,7 +8024,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
case AMDGPU::S_CVT_HI_F32_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.useRealTrue16Insts()) {
@@ -8024,7 +8053,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
@@ -8042,7 +8070,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
@@ -8066,7 +8093,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
- const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
? &AMDGPU::VGPR_16RegClass
: &AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..93cfd5ab3750c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2012,6 +2012,14 @@ let AddedComplexity = 20 in {
>;
}
+let SubtargetPredicate = isGFX9Plus in
+foreach I = 1-4 in {
+def : GCNPat <
+ (i32 (UniformBinFrag<add> (shl_oneuse i32:$src0, (i32 I)), i32:$src1)),
+ (!cast<SOP2_Pseudo>("S_LSHL"#I#"_ADD_U32") $src0, $src1)
+>;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index 06150e4277e9a..7669ae21f6635 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -51,10 +51,8 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
@@ -69,8 +67,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
; GFX942-LABEL: test_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_mov_b32 s32, 16
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: v_mov_b32_e32 v0, 0
@@ -211,15 +208,13 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s1, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: scratch_store_b32 off, v0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -232,8 +227,7 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_alloca_and_call_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
@@ -396,14 +390,12 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX12-NEXT: s_add_co_u32 s2, s2, foo at gotpcrel32@lo+12
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo at gotpcrel32@hi+24
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: s_add_co_i32 s0, s0, 15
; GFX12-NEXT: s_mov_b32 s32, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, -16
-; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s4, s32
; GFX12-NEXT: s_lshl_b32 s0, s0, 5
; GFX12-NEXT: v_mov_b32_e32 v40, 0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -417,8 +409,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
; GFX942-LABEL: test_call_and_alloca_var_uniform:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX942-NEXT: s_and_b32 s0, s0, -16
; GFX942-NEXT: s_lshl_b32 s2, s0, 6
; GFX942-NEXT: s_getpc_b64 s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 9a4040a25419a..49977a4c64784 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX9-NEXT: v_mov_b32_e32 v0, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_i32 s2, s2, s3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
@@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index c5db7a33f70e0..9f2001d452fe3 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
@@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
@@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
-; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
@@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
@@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff
-; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15
; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
@@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
-; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2
; GFX11-SDAG-NEXT: s_endpgm
; GFX11-SDAG-NEXT: .LBB7_6:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index b0e6752386285..e01cb79382c05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 3eef616ba267d..ad894ce36c55b 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 4
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -163,8 +161,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
@@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v1, v0
; CHECK-NEXT: ds_write_b32 v2, v1
@@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; CHECK-NEXT: s_lshl_b32 s4, s17, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index 3c55dcb486675..91489d76b18f6 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -6,8 +6,8 @@
; ELF: Relocations [
; ELF-NEXT: Section (3) .rel.text {
-; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external
-; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined
+; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external
+; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined
; ELF-NEXT: }
; ELF-NEXT: ]
@@ -35,7 +35,7 @@
; GCN: v_mov_b32_e32 v1, lds.external at abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
; GCN-NEXT: ; fixup A - offset: 4, value: lds.external at abs32@lo, kind: FK_Data_4{{$}}
;
-; GCN: s_add_i32 s0, s0, lds.defined at abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A]
+; GCN: s_lshl2_add_u32 s0, s2, lds.defined at abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A]
; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined at abs32@lo, kind: FK_Data_4{{$}}
;
; GCN: .globl lds.external
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
index 69439d49e588f..de82dcdecda48 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
@@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %ou
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1
-; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
index 497241cff392d..6b6658bd672de 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
@@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" {
;
; GFX1250-SDAG-LABEL: workgroup_id_optimized:
; GFX1250-SDAG: ; %bb.0: ; %.entry
-; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1
-; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15
-; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14
-; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0
-; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc
+; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14
; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15
+; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc
; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008
; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3
; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004
-; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0
+; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1
; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null
; GFX1250-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 4b5a7c207055a..8dea9e87e140f 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -1620,15 +1620,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_1-NEXT: s_add_i32 s55, s55, s4
+; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_1-NEXT: s_addk_i32 s4, 0x4040
+; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_1-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s55, scc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1650,15 +1649,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_3-NEXT: s_add_i32 s55, s55, s4
+; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
+; GFX10_3-NEXT: s_addk_i32 s4, 0x4040
+; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_3-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s55, scc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1677,15 +1675,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX11-NEXT: s_add_i32 s2, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s1, s32, 64
; GFX11-NEXT: v_writelane_b32 v1, s55, 0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_add_i32 s1, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-NEXT: s_add_i32 s55, s32, s0
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_addk_i32 s55, 0x4040
+; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s55, scc
@@ -1710,16 +1708,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: v_writelane_b32 v1, s55, 0
-; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s55, s32, s0
+; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_addk_co_i32 s55, 0x4000
-; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s55, scc
; GFX12-NEXT: ;;#ASMEND
@@ -1767,11 +1763,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_lshr_b32 s4, s32, 6
+; GFX900-NEXT: s_addk_i32 s4, 0x4040
; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: s_lshl_b32 s4, s16, 2
-; GFX900-NEXT: s_lshr_b32 s55, s32, 6
-; GFX900-NEXT: s_add_i32 s55, s55, s4
-; GFX900-NEXT: s_addk_i32 s55, 0x4040
+; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -1796,10 +1791,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX942-NEXT: s_add_i32 s1, s32, 0x8040
; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
-; GFX942-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4040
; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_add_i32 s55, s32, s0
-; GFX942-NEXT: s_addk_i32 s55, 0x4040
+; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
index 6e2d0f6503a20..7e2bfa666a19f 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -144,7 +144,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GFX9: global_load_dword [[VADDR:v[0-9]+]],
-; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]]
+; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}}
; GFX9-NOT [[ADDR]]
; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 627f4ada95dba..c1f52173c7451 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
-; MUBUF-NEXT: s_add_i32 s6, s6, s7
+; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
-; FLATSCR-NEXT: s_add_i32 s2, s2, s3
+; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff
; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: v_mov_b32_e32 v3, 1
-; MUBUF-NEXT: s_add_i32 s4, s4, s5
+; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
@@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
-; FLATSCR-NEXT: s_add_i32 s0, s0, s1
+; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
index 3e507a0c5889f..ba8ae9554d0e8 100644
--- a/llvm/test/CodeGen/AMDGPU/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
@@ -14,8 +14,7 @@
define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl1_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl1_add_u32:
@@ -26,8 +25,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl1_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl1_add_u32:
@@ -53,8 +51,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl2_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl2_add_u32:
@@ -65,8 +62,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl2_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl2_add_u32:
@@ -92,8 +88,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl3_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl3_add_u32:
@@ -104,8 +99,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl3_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl3_add_u32:
@@ -131,8 +125,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
; GFX9-SDAG-LABEL: s_shl4_add_u32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl4_add_u32:
@@ -143,8 +136,7 @@ define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
;
; GFX10-SDAG-LABEL: s_shl4_add_u32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl4_add_u32:
@@ -598,10 +590,8 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl1_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 1
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl1_add_u32_v2:
@@ -614,10 +604,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl1_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 1
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl1_add_u32_v2:
@@ -647,10 +635,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl2_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 2
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl2_add_u32_v2:
@@ -663,10 +649,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl2_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl2_add_u32_v2:
@@ -696,10 +680,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl3_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 3
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl3_add_u32_v2:
@@ -712,10 +694,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl3_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl3_add_u32_v2:
@@ -745,10 +725,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl4_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl4_add_u32_v2:
@@ -761,10 +739,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
;
; GFX10-SDAG-LABEL: s_shl4_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl4_add_u32_v2:
@@ -794,10 +770,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
-; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
; GFX9-SDAG-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2:
@@ -810,10 +784,8 @@ define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32
;
; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
-; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3
; GFX10-SDAG-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl_2_4_add_u32_v2:
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..90304b2c730cb 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -69,6 +69,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
@@ -92,7 +93,6 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
@@ -101,6 +101,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
@@ -113,10 +114,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
@@ -127,25 +126,25 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
@@ -164,11 +163,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -185,11 +184,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
@@ -198,32 +197,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %470:sreg_64
+ ; CHECK-NEXT: KILL undef %469:sreg_64
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
- ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
- ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
@@ -236,10 +235,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
- ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -310,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec
@@ -326,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec
- ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec
@@ -351,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
- ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
- ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
>From de2f09cb46b3852c082e94d32fe83d4113097f13 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 7 Nov 2025 15:48:11 -0600
Subject: [PATCH 2/3] Generate s_lshl?_add_u32
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb94b4b22e85f..9701079bb2761 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7905,21 +7905,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
+ : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
+ : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
+ : 4);
- dbgs() << "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n";
- Inst.dump();
- unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 :
- Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 :
- Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3 : 4);
-
const TargetRegisterClass *NewRC =
RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
Register DestReg = MRI.createVirtualRegister(NewRC);
- MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
- .add(Src0)
- .addImm(ShiftAmt)
- .add(Src1);
-
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
+ .add(Src0)
+ .addImm(ShiftAmt)
+ .add(Src1);
legalizeOperands(*NewInstr, MDT);
MRI.replaceRegWith(Dest.getReg(), DestReg);
>From 7610a5db285f9901c066c4d5890a46af9e4a72ed Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 7 Nov 2025 16:42:51 -0600
Subject: [PATCH 3/3] Test moving to VALU
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../CodeGen/AMDGPU/move-to-valu-lshl_add.ll | 90 +++++++++++++++++++
1 file changed, 90 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
new file mode 100644
index 0000000000000..b2d9fe667d958
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies < %s | FileCheck %s
+
+define amdgpu_kernel void @lshl1_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl1_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 1, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_SHORT killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i16, ptr addrspace(5) %alloca, i64 %1
+ store i16 0, ptr addrspace(5) %gep, align 2
+ ret void
+}
+
+define amdgpu_kernel void @lshl2_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl2_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 2, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD killed [[V_MOV_B32_e32_]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i32, ptr addrspace(5) %alloca, i64 %1
+ store i32 0, ptr addrspace(5) %gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lshl3_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl3_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 3, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORDX2 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i64, ptr addrspace(5) %alloca, i64 %1
+ store i64 0, ptr addrspace(5) %gep, align 8
+ ret void
+}
+
+define amdgpu_kernel void @lshl4_add(ptr addrspace(5) %alloca) {
+ ; CHECK-LABEL: name: lshl4_add
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.alloca.kernarg.offset, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[V_MOV_B]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from `ptr addrspace(1) null`, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], 4, killed [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK-NEXT: SCRATCH_STORE_DWORDX4 killed [[REG_SEQUENCE]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %ir.gep, addrspace 5)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %vaddr = load volatile i32, ptr addrspace(1) null, align 4
+ %1 = sext i32 %vaddr to i64
+ %gep = getelementptr i128, ptr addrspace(5) %alloca, i64 %1
+ store i128 0, ptr addrspace(5) %gep, align 16
+ ret void
+}
More information about the llvm-commits
mailing list