[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 3 03:58:16 PDT 2023


https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/71035

>From 7e8fea301e01f953532ce817e53f716617323164 Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <stefan.stipanovic at syrmia.com>
Date: Fri, 3 Nov 2023 11:49:15 +0100
Subject: [PATCH] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by
 constant

---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |   14 +
 .../AMDGPU/atomic_optimizations_buffer.ll     |   38 +-
 .../atomic_optimizations_global_pointer.ll    |  275 ++--
 .../atomic_optimizations_local_pointer.ll     |  106 +-
 .../atomic_optimizations_pixelshader.ll       |  120 +-
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |   38 +-
 .../atomic_optimizations_struct_buffer.ll     |   34 +-
 .../CodeGen/AMDGPU/frame-index-elimination.ll | 1157 ++++++++++++++---
 llvm/test/CodeGen/AMDGPU/mul.ll               |   45 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll    |   14 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |    4 +-
 12 files changed, 1388 insertions(+), 465 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c0e0ac1b4ec8873..1cf19da3840c7d0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -515,6 +515,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
   let HasExtVOP3DPP = 0;
 }
 
+def IsPow2_32: PatLeaf<(i32 imm), [{ // Matches an i32 immediate V for which (V - 1) is a power of two, i.e. V == 2^k + 1; NOTE(review): the name suggests V itself is a power of two.
+  uint32_t V = N->getZExtValue(); // Narrow the zero-extended immediate to 32 bits.
+  return isPowerOf2_32(V - 1); // The test is on V - 1, not V; presumably rejects V == 1 (V - 1 == 0) - confirm against isPowerOf2_32.
+}]>;
+
+def Log2_32: SDNodeXForm<imm, [{ // Transforms an immediate V into an i32 target constant holding Log2_32(V - 1).
+  uint32_t V = N->getZExtValue(); // Narrow the zero-extended immediate to 32 bits.
+  return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32); // NOTE(review): the inner Log2_32 is presumably the LLVM math helper, not this def - same name, confirm.
+}]>;
+
 let SubtargetPredicate = isGFX9Plus in {
 let isCommutable = 1, isReMaterializable = 1 in {
   defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -612,6 +622,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
 def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
 def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
 
+def : GCNPat< // Selects (mul x, C), where C satisfies IsPow2_32 (C - 1 is a power of two), to V_LSHL_ADD_U32_e64: (x << Log2_32(C)) + x.
+ (mul i32:$src0, IsPow2_32:$src1), // NOTE(review): no subtarget predicate or divergence check is visible here - confirm uniform multiplies should also match.
+ (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; // $src0 is used twice: as the shifted value and as the addend.
+
 let SubtargetPredicate = isGFX940Plus in
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5aa..6ad768953d17537 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -83,8 +83,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:  .LBB0_2:
@@ -110,8 +109,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:  ; %bb.1:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX10W64-NEXT:  .LBB0_2:
@@ -137,8 +135,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:  ; %bb.1:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX10W32-NEXT:  .LBB0_2:
@@ -166,9 +163,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:  ; %bb.1:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
 ; GFX11W64-NEXT:  .LBB0_2:
@@ -197,9 +193,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:  ; %bb.1:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX11W32-NEXT:  .LBB0_2:
@@ -1156,8 +1151,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:  .LBB5_2:
@@ -1184,8 +1178,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:  ; %bb.1:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX10W64-NEXT:  .LBB5_2:
@@ -1212,8 +1205,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:  ; %bb.1:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[4:7], 0 glc
 ; GFX10W32-NEXT:  .LBB5_2:
@@ -1242,9 +1234,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:  ; %bb.1:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
 ; GFX11W64-NEXT:  .LBB5_2:
@@ -1274,9 +1265,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:  ; %bb.1:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
 ; GFX11W32-NEXT:  .LBB5_2:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f83..d535fc0554b7663 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -45,38 +45,70 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i32_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_mov_b64 s[6:7], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_2
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT:    s_mul_i32 s2, s2, 5
-; GFX89-NEXT:    s_mov_b32 s11, 0xf000
-; GFX89-NEXT:    s_mov_b32 s10, -1
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v1, s2
-; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_wbinvl1_vol
-; GFX89-NEXT:  .LBB0_2:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB0_2
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT:    s_mul_i32 s2, s2, 5
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
+; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:  .LBB0_2:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_2
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    v_lshl_add_u32 v1, s2, 2, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -91,9 +123,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    v_lshl_add_u32 v1, s6, 2, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
 ; GFX1064-NEXT:    s_mov_b32 s9, s3
@@ -126,9 +157,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    v_lshl_add_u32 v1, s5, 2, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
 ; GFX1032-NEXT:    s_mov_b32 s9, s3
@@ -163,9 +193,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1164-NEXT:    v_lshl_add_u32 v1, s6, 2, s6
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_mov_b32 s8, s2
 ; GFX1164-NEXT:    s_mov_b32 s9, s3
@@ -201,9 +230,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1132-NEXT:    v_lshl_add_u32 v1, s5, 2, s5
 ; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s8, s2
 ; GFX1132-NEXT:    s_mov_b32 s9, s3
@@ -869,43 +897,80 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i64_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_mov_b64 s[6:7], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX89-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB3_2
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT:    s_mul_i32 s2, s2, 5
-; GFX89-NEXT:    s_mov_b32 s11, 0xf000
-; GFX89-NEXT:    s_mov_b32 s10, -1
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v0, s2
-; GFX89-NEXT:    v_mov_b32_e32 v1, 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_wbinvl1_vol
-; GFX89-NEXT:  .LBB3_2:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX89-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX89-NEXT:    v_mov_b32_e32 v0, s2
-; GFX89-NEXT:    v_mov_b32_e32 v1, s3
-; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    s_nop 2
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i64_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB3_2
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT:    s_mul_i32 s2, s2, 5
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
+; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:  .LBB3_2:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_nop 2
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i64_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB3_2
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    v_lshl_add_u32 v0, s2, 2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:  .LBB3_2:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_nop 2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -920,9 +985,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    v_lshl_add_u32 v0, s6, 2, s6
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
@@ -957,9 +1021,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    v_lshl_add_u32 v0, s5, 2, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
@@ -996,9 +1059,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1164-NEXT:    v_lshl_add_u32 v0, s6, 2, s6
 ; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_mov_b32 s8, s2
@@ -1035,10 +1097,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-NEXT:    v_lshl_add_u32 v0, s5, 2, s5
 ; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s8, s2
 ; GFX1132-NEXT:    s_mov_b32 s9, s3
@@ -1552,28 +1614,27 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-LABEL: sub_i32_constant:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_mov_b32 s9, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_lshl_add_u32 v1, s2, 2, s2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB6_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1596,9 +1657,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    v_lshl_add_u32 v1, s6, 2, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
 ; GFX1064-NEXT:    s_mov_b32 s9, s3
@@ -1632,9 +1692,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    v_lshl_add_u32 v1, s5, 2, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
 ; GFX1032-NEXT:    s_mov_b32 s9, s3
@@ -1670,9 +1729,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1164-NEXT:    v_lshl_add_u32 v1, s6, 2, s6
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_mov_b32 s8, s2
 ; GFX1164-NEXT:    s_mov_b32 s9, s3
@@ -1709,9 +1767,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1132-NEXT:    v_lshl_add_u32 v1, s5, 2, s5
 ; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s8, s2
 ; GFX1132-NEXT:    s_mov_b32 s9, s3
@@ -2435,11 +2492,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s8, s2
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_mov_b32 s9, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_lshl_add_u32 v0, s2, 2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
@@ -2473,9 +2529,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1064-NEXT:    v_lshl_add_u32 v0, s6, 2, s6
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
@@ -2513,9 +2568,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
+; GFX1032-NEXT:    v_lshl_add_u32 v0, s5, 2, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
@@ -2555,9 +2609,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
+; GFX1164-NEXT:    v_lshl_add_u32 v0, s6, 2, s6
 ; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_mov_b32 s8, s2
@@ -2597,10 +2650,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132-NEXT:    v_lshl_add_u32 v0, s5, 2, s5
 ; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s8, s2
 ; GFX1132-NEXT:    s_mov_b32 s9, s3
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index abd9a4159f8ccd9..fcba00d3f04fd13 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -86,11 +86,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX9-NEXT:    ds_add_rtn_u32 v1, v2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB0_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -114,12 +113,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB0_2:
@@ -144,12 +142,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX1032-NEXT:    ds_add_rtn_u32 v1, v2, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB0_2:
@@ -176,13 +173,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX1164-NEXT:    ds_add_rtn_u32 v1, v2, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB0_2:
@@ -210,12 +205,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX1132-NEXT:    ds_add_rtn_u32 v1, v2, v1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:  .LBB0_2:
@@ -1069,8 +1063,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
@@ -1103,8 +1096,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
@@ -1134,8 +1126,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032-NEXT:    v_lshl_add_u32 v0, s3, 2, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
@@ -1167,9 +1158,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1164-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
@@ -1202,9 +1191,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1132-NEXT:    v_lshl_add_u32 v0, s3, 2, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1132-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
@@ -1671,11 +1658,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX9-NEXT:    ds_sub_rtn_u32 v1, v2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB7_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
@@ -1700,12 +1686,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB7_2:
@@ -1731,12 +1716,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v2, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB7_2:
@@ -1764,13 +1748,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX1164-NEXT:    ds_sub_rtn_u32 v1, v2, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB7_2:
@@ -1799,12 +1781,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX1132-NEXT:    ds_sub_rtn_u32 v1, v2, v1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:  .LBB7_2:
@@ -2663,8 +2644,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
@@ -2697,8 +2677,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
@@ -2731,8 +2710,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1032-NEXT:    v_lshl_add_u32 v0, s3, 2, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
@@ -2767,9 +2745,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164-NEXT:    v_lshl_add_u32 v0, s4, 2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1164-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
@@ -2805,9 +2781,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1132-NEXT:    v_lshl_add_u32 v0, s3, 2, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1132-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 1ebd864e7e03aa9..bbc5acd2d1069c3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -49,40 +49,74 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX7-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
 ; GFX7-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i32_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_mov_b64 s[10:11], exec
-; GFX89-NEXT:    ; implicit-def: $vgpr0
-; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
-; GFX89-NEXT:    s_cbranch_execz .LBB0_4
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_mov_b64 s[12:13], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_3
-; GFX89-NEXT:  ; %bb.2:
-; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
-; GFX89-NEXT:    s_mul_i32 s12, s12, 5
-; GFX89-NEXT:    v_mov_b32_e32 v1, s12
-; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX89-NEXT:  .LBB0_3:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT:  .LBB0_4: ; %Flow
-; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
-; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
-; GFX89-NEXT:  ; %bb.5: ; %if
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b64 s[10:11], exec
+; GFX8-NEXT:    ; implicit-def: $vgpr0
+; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX8-NEXT:    s_cbranch_execz .LBB0_4
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB0_3
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
+; GFX8-NEXT:    s_mul_i32 s12, s12, 5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX8-NEXT:  .LBB0_3:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT:  .LBB0_4: ; %Flow
+; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX8-NEXT:  ; %bb.5: ; %if
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b64 s[10:11], exec
+; GFX9-NEXT:    ; implicit-def: $vgpr0
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX9-NEXT:    s_cbranch_execz .LBB0_4
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:  ; %bb.2:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
+; GFX9-NEXT:    v_lshl_add_u32 v1, s12, 2, s12
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX9-NEXT:  .LBB0_3:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT:  .LBB0_4: ; %Flow
+; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX9-NEXT:  ; %bb.5: ; %if
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -100,8 +134,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1064-NEXT:  ; %bb.2:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
-; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
+; GFX1064-NEXT:    v_lshl_add_u32 v1, s12, 2, s12
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1064-NEXT:  .LBB0_3:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
@@ -135,8 +168,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1032-NEXT:  ; %bb.2:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
-; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
+; GFX1032-NEXT:    v_lshl_add_u32 v1, s10, 2, s10
 ; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1032-NEXT:  .LBB0_3:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
@@ -173,9 +205,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1164-NEXT:  ; %bb.2:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_mul_i32 s12, s12, 5
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_lshl_add_u32 v1, s12, 2, s12
 ; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX1164-NEXT:  .LBB0_3:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
@@ -214,9 +245,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1132-NEXT:  ; %bb.2:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s10, s10
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_mul_i32 s10, s10, 5
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s10
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_lshl_add_u32 v1, s10, 2, s10
 ; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX1132-NEXT:  .LBB0_3:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
@@ -650,3 +680,5 @@ if:
 else:
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index baa0c72dbf63e2d..6ff45b05f005ed3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -82,8 +82,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:  .LBB0_2:
@@ -109,8 +108,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:  ; %bb.1:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX10W64-NEXT:  .LBB0_2:
@@ -136,8 +134,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:  ; %bb.1:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX10W32-NEXT:  .LBB0_2:
@@ -165,9 +162,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:  ; %bb.1:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
 ; GFX11W64-NEXT:  .LBB0_2:
@@ -196,9 +192,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:  ; %bb.1:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX11W32-NEXT:  .LBB0_2:
@@ -858,8 +853,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:  .LBB4_2:
@@ -886,8 +880,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:  ; %bb.1:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX10W64-NEXT:  .LBB4_2:
@@ -914,8 +907,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:  ; %bb.1:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[4:7], 0 glc
 ; GFX10W32-NEXT:  .LBB4_2:
@@ -944,9 +936,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:  ; %bb.1:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
 ; GFX11W64-NEXT:  .LBB4_2:
@@ -976,9 +967,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:  ; %bb.1:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
 ; GFX11W32-NEXT:  .LBB4_2:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 37d421b01797945..681f30e64d80e51 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -84,8 +84,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
@@ -113,8 +112,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
 ; GFX10W64-NEXT:  .LBB0_2:
@@ -141,8 +139,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[4:7], 0 idxen glc
 ; GFX10W32-NEXT:  .LBB0_2:
@@ -171,9 +168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
 ; GFX11W64-NEXT:  .LBB0_2:
@@ -203,9 +198,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc
 ; GFX11W32-NEXT:  .LBB0_2:
@@ -985,8 +978,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
@@ -1015,8 +1007,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX10W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
 ; GFX10W64-NEXT:  .LBB5_2:
@@ -1044,8 +1035,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX10W32-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc
 ; GFX10W32-NEXT:  .LBB5_2:
@@ -1075,9 +1065,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX11W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11W64-NEXT:    s_mul_i32 s4, s4, 5
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT:    v_lshl_add_u32 v1, s4, 2, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
 ; GFX11W64-NEXT:  .LBB5_2:
@@ -1108,9 +1096,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX11W32-NEXT:    s_bcnt1_i32_b32 s3, s3
 ; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11W32-NEXT:    s_mul_i32 s3, s3, 5
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11W32-NEXT:    v_lshl_add_u32 v1, s3, 2, s3
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc
 ; GFX11W32-NEXT:  .LBB5_2:
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 028f32844576f6d..64bff1194c3aa19 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
@@ -7,44 +8,91 @@
 ; give an index relative to the scratch wave offset register
 
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
-; GCN-LABEL: {{^}}func_mov_fi_i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 
-; CI-NEXT: v_lshr_b32_e64 v0, s32, 6
-; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-
-; GFX9-FLATSCR:     v_mov_b32_e32 v0, s32
-; GFX9-FLATSCR-NOT: v_lshrrev_b32_e64
-
-; MUBUF-NOT: v_mov
-
-; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
+; CI-LABEL: func_mov_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_mov_fi_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca, ptr addrspace(3) undef
   ret void
 }
 
 ; Offset due to different objects
-; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-
-; CI-DAG: v_lshr_b32_e64 v0, s32, 6
-; CI-NOT: v_mov
-; CI: ds_write_b32 v0, v0
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]]
-; CI-NEXT: ds_write_b32 v0, v0
-
-; GFX9-MUBUF-NEXT:   v_lshrrev_b32_e64 v0, 6, s32
-; GFX9-FLATSCR:      v_mov_b32_e32 v0, s32
-; GFX9-FLATSCR:      s_add_i32 [[ADD:[^,]+]], s32, 4
-; GFX9-NEXT:         ds_write_b32 v0, v0
-; GFX9-MUBUF-NEXT:   v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT:   v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]]
-; GFX9-NEXT:         ds_write_b32 v0, v0
 define void @func_mov_fi_i32_offset() #0 {
+; CI-LABEL: func_mov_fi_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_mov_fi_i32_offset:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_mov_fi_i32_offset:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 4
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_mov_fi_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca i32, addrspace(5)
   %alloca1 = alloca i32, addrspace(5)
   store volatile ptr addrspace(5) %alloca0, ptr addrspace(3) undef
@@ -55,21 +103,42 @@ define void @func_mov_fi_i32_offset() #0 {
 ; Materialize into an add of a constant offset from the FI.
 ; FIXME: Should be able to merge adds
 
-; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-
-; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
-
-; GFX9-MUBUF:       v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT:  v_add_u32_e32 v0, 4, [[SCALED]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[ADD:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
-
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @func_add_constant_to_fi_i32() #0 {
+; CI-LABEL: func_add_constant_to_fi_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_add_constant_to_fi_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_add_constant_to_fi_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_add_constant_to_fi_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [2 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   store volatile ptr addrspace(5) %gep0, ptr addrspace(3) undef
@@ -81,17 +150,42 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; FIXME: Should use s_mul but the frame index always gets materialized into a
 ; vgpr
 
-; GCN-LABEL: {{^}}func_other_fi_user_i32:
-
-; CI: v_lshr_b32_e64 v0, s32, 6
-
-; GFX9-MUBUF:   v_lshrrev_b32_e64 v0, 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 v0, s32
-
-; GCN-NEXT: v_mul_lo_u32 v0, v0, 9
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @func_other_fi_user_i32() #0 {
+; CI-LABEL: func_other_fi_user_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshl_add_u32 v0, s32, 3, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [2 x i32], align 4, addrspace(5)
   %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
   %mul = mul i32 %ptrtoint, 9
@@ -99,39 +193,110 @@ define void @func_other_fi_user_i32() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
-; GCN: v_mov_b32_e32 v1, 15{{$}}
-; MUBUF:        buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
-; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}}
 define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_store_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v1, 15
+; CI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_store_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-FLATSCR-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_store_private_arg_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile i32 15, ptr addrspace(5) %ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
-; GCN: s_waitcnt
-; MUBUF-NEXT:        buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
-; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
 define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+; CI-LABEL: func_load_private_arg_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_load_private_arg_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_load_private_arg_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(5) %ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
-; GCN: s_waitcnt
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
-
-; GFX9-MUBUF:      v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[SP:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]]
-
-; GCN-NOT: v_mov
-; GCN: ds_write_b32 v0, v0
 define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_or_b32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load1 = load i32, ptr addrspace(5) %gep1
@@ -139,13 +304,57 @@ define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32
-; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4
+
 define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    ds_write_b8 v0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; GFX9-MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-MUBUF-NEXT:    ds_write_b8 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v1
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_load_ubyte v0, off, s32
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v1, off, s32 offset:4
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    ds_write_b8 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v1
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr_value:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_u8 v0, off, s32
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    ds_store_b8 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
   %load0 = load i8, ptr addrspace(5) %gep0
@@ -155,24 +364,75 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-
-; GCN: s_and_saveexec_b64
 
-; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
-; GFX9-MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
-; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
-
-; GFX9-MUBUF:   v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
-
-; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
-
-; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB8_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB8_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB8_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB8_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-NEXT:  ; %bb.1: ; %bb
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:  .LBB8_2: ; %ret
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
 
@@ -187,22 +447,60 @@ ret:
   ret void
 }
 
-; Added offset can't be used with VOP3 add
-; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
-
-; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
-
-; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF:     v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
-
-; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200
-; GFX9-FLATSCR:     v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]]
-
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
-; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_movk_i32 vcc_lo, 0x200
+; CI-NEXT:    v_add_i32_e32 v0, vcc, vcc_lo, v0
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    v_mov_b32_e32 v1, 7
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX9-MUBUF-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_non_inline_imm_offset_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    v_lshl_add_u32 v1, s0, 3, s1
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65
@@ -213,21 +511,84 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
-
-; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
-; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
-; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
-
-; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
-; GFX9-MUBUF:     v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]
-
-; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200
-; GFX9-FLATSCR:     v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]]
-
-; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9
-; GCN: ds_write_b32 v0, [[VZ]]
 define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
+; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 7
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; def vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    s_movk_i32 s4, 0x200
+; CI-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; CI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ;;#ASMSTART
+; CI-NEXT:    ; use vcc
+; CI-NEXT:    ;;#ASMEND
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; def vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX9-MUBUF-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-MUBUF-NEXT:    ;;#ASMSTART
+; GFX9-MUBUF-NEXT:    ; use vcc
+; GFX9-MUBUF-NEXT:    ;;#ASMEND
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; def vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:260
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX9-FLATSCR-NEXT:    ; use vcc
+; GFX9-FLATSCR-NEXT:    ;;#ASMEND
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x200
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x200
+; GFX11-NEXT:    v_mov_b32_e32 v0, 7
+; GFX11-NEXT:    v_lshl_add_u32 v1, s0, 3, s1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; def vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:260 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use vcc
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
@@ -245,17 +606,392 @@ declare void @func(ptr addrspace(5) nocapture) #0
 ; undef flag not preserved in eliminateFrameIndex when handling the
 ; stores in the middle block.
 
-; GCN-LABEL: {{^}}undefined_stack_store_reg:
-; GCN: s_and_saveexec_b64
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset:
-; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v0, off, s33 offset:
-; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset:
 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
+; CI-LABEL: undefined_stack_store_reg:
+; CI:       ; %bb.0: ; %bb
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s16, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[18:19]
+; CI-NEXT:    v_writelane_b32 v40, s16, 18
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    v_writelane_b32 v40, s34, 2
+; CI-NEXT:    v_writelane_b32 v40, s35, 3
+; CI-NEXT:    v_writelane_b32 v40, s36, 4
+; CI-NEXT:    v_writelane_b32 v40, s37, 5
+; CI-NEXT:    v_writelane_b32 v40, s38, 6
+; CI-NEXT:    v_writelane_b32 v40, s39, 7
+; CI-NEXT:    v_writelane_b32 v40, s40, 8
+; CI-NEXT:    v_writelane_b32 v40, s41, 9
+; CI-NEXT:    v_writelane_b32 v40, s42, 10
+; CI-NEXT:    v_writelane_b32 v40, s43, 11
+; CI-NEXT:    v_writelane_b32 v40, s44, 12
+; CI-NEXT:    v_writelane_b32 v40, s45, 13
+; CI-NEXT:    v_writelane_b32 v40, s46, 14
+; CI-NEXT:    v_writelane_b32 v40, s47, 15
+; CI-NEXT:    v_writelane_b32 v40, s48, 16
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    v_writelane_b32 v40, s49, 17
+; CI-NEXT:    v_mov_b32_e32 v41, v0
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT:    s_addk_i32 s32, 0xc00
+; CI-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; CI-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; CI-NEXT:    s_cbranch_execz .LBB11_2
+; CI-NEXT:  ; %bb.1: ; %bb4
+; CI-NEXT:    s_getpc_b64 s[16:17]
+; CI-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; CI-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; CI-NEXT:    s_load_dwordx2 s[48:49], s[16:17], 0x0
+; CI-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; CI-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; CI-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; CI-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; CI-NEXT:    s_mov_b32 s42, s12
+; CI-NEXT:    s_mov_b32 s43, s13
+; CI-NEXT:    s_mov_b32 s44, s14
+; CI-NEXT:    s_mov_b32 s45, s15
+; CI-NEXT:    v_mov_b32_e32 v42, v31
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16
+; CI-NEXT:    v_lshr_b32_e64 v0, s33, 6
+; CI-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; CI-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; CI-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CI-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; CI-NEXT:    s_mov_b32 s12, s42
+; CI-NEXT:    s_mov_b32 s13, s43
+; CI-NEXT:    s_mov_b32 s14, s44
+; CI-NEXT:    s_mov_b32 s15, s45
+; CI-NEXT:    v_mov_b32_e32 v31, v42
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CI-NEXT:  .LBB11_2: ; %bb5
+; CI-NEXT:    s_or_b64 exec, exec, s[46:47]
+; CI-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    v_readlane_b32 s49, v40, 17
+; CI-NEXT:    v_readlane_b32 s48, v40, 16
+; CI-NEXT:    v_readlane_b32 s47, v40, 15
+; CI-NEXT:    v_readlane_b32 s46, v40, 14
+; CI-NEXT:    v_readlane_b32 s45, v40, 13
+; CI-NEXT:    v_readlane_b32 s44, v40, 12
+; CI-NEXT:    v_readlane_b32 s43, v40, 11
+; CI-NEXT:    v_readlane_b32 s42, v40, 10
+; CI-NEXT:    v_readlane_b32 s41, v40, 9
+; CI-NEXT:    v_readlane_b32 s40, v40, 8
+; CI-NEXT:    v_readlane_b32 s39, v40, 7
+; CI-NEXT:    v_readlane_b32 s38, v40, 6
+; CI-NEXT:    v_readlane_b32 s37, v40, 5
+; CI-NEXT:    v_readlane_b32 s36, v40, 4
+; CI-NEXT:    v_readlane_b32 s35, v40, 3
+; CI-NEXT:    v_readlane_b32 s34, v40, 2
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v40, 18
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xf400
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: undefined_stack_store_reg:
+; GFX9-MUBUF:       ; %bb.0: ; %bb
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_mov_b32 s16, s33
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s32
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s16, 18
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s47, 15
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s48, 16
+; GFX9-MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-MUBUF-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-MUBUF-NEXT:    s_addk_i32 s32, 0xc00
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb4
+; GFX9-MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-MUBUF-NEXT:    s_add_u32 s16, s16, func@gotpcrel32@lo+4
+; GFX9-MUBUF-NEXT:    s_addc_u32 s17, s17, func@gotpcrel32@hi+12
+; GFX9-MUBUF-NEXT:    s_load_dwordx2 s[48:49], s[16:17], 0x0
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s42, s12
+; GFX9-MUBUF-NEXT:    s_mov_b32 s43, s13
+; GFX9-MUBUF-NEXT:    s_mov_b32 s44, s14
+; GFX9-MUBUF-NEXT:    s_mov_b32 s45, s15
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v42, v31
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:28
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:24
+; GFX9-MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:20
+; GFX9-MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-MUBUF-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX9-MUBUF-NEXT:    s_mov_b32 s12, s42
+; GFX9-MUBUF-NEXT:    s_mov_b32 s13, s43
+; GFX9-MUBUF-NEXT:    s_mov_b32 s14, s44
+; GFX9-MUBUF-NEXT:    s_mov_b32 s15, s45
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v31, v42
+; GFX9-MUBUF-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX9-MUBUF-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-MUBUF-NEXT:  .LBB11_2: ; %bb5
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[46:47]
+; GFX9-MUBUF-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s49, v40, 17
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s48, v40, 16
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s47, v40, 15
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-MUBUF-NEXT:    v_readlane_b32 s4, v40, 18
+; GFX9-MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-MUBUF-NEXT:    s_addk_i32 s32, 0xf400
+; GFX9-MUBUF-NEXT:    s_mov_b32 s33, s4
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: undefined_stack_store_reg:
+; GFX9-FLATSCR:       ; %bb.0: ; %bb
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, s33
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s32
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:32 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s47, 15
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s48, 16
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-FLATSCR-NEXT:    s_add_i32 s32, s32, 48
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[41:44], s0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb4
+; GFX9-FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX9-FLATSCR-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX9-FLATSCR-NEXT:    s_load_dwordx2 s[48:49], s[0:1], 0x0
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s42, s12
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s43, s13
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s44, s14
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s45, s15
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v42, v31
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-FLATSCR-NEXT:    s_add_i32 s0, s33, 16
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX9-FLATSCR-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s12, s42
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s13, s43
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s14, s44
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s15, s45
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v31, v42
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[41:44], s33 offset:16
+; GFX9-FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX9-FLATSCR-NEXT:  .LBB11_2: ; %bb5
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[46:47]
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v42, off, s33 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s49, v40, 17
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s48, v40, 16
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s47, v40, 15
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-FLATSCR-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX9-FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX9-FLATSCR-NEXT:    s_addk_i32 s32, 0xffd0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s33, s0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: undefined_stack_store_reg:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:32 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 17
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33
+; GFX11-NEXT:    v_mov_b32_e32 v41, v0
+; GFX11-NEXT:    s_add_i32 s32, s32, 48
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX11-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX11-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX11-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX11-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX11-NEXT:    s_mov_b32 s46, exec_lo
+; GFX11-NEXT:    v_writelane_b32 v40, s48, 15
+; GFX11-NEXT:    v_writelane_b32 v40, s49, 16
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX11-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-NEXT:  ; %bb.1: ; %bb4
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, func@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, func@gotpcrel32@hi+12
+; GFX11-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX11-NEXT:    s_load_b64 s[48:49], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GFX11-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GFX11-NEXT:    s_mov_b64 s[40:41], s[10:11]
+; GFX11-NEXT:    s_mov_b32 s42, s12
+; GFX11-NEXT:    v_mov_b32_e32 v42, v31
+; GFX11-NEXT:    s_mov_b32 s43, s13
+; GFX11-NEXT:    s_mov_b32 s44, s14
+; GFX11-NEXT:    s_mov_b32 s45, s15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX11-NEXT:    s_add_i32 s0, s33, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT:    s_mov_b64 s[4:5], s[34:35]
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GFX11-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; GFX11-NEXT:    s_mov_b64 s[10:11], s[40:41]
+; GFX11-NEXT:    s_mov_b32 s12, s42
+; GFX11-NEXT:    s_mov_b32 s13, s43
+; GFX11-NEXT:    s_mov_b32 s14, s44
+; GFX11-NEXT:    s_mov_b32 s15, s45
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s33 offset:16
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; GFX11-NEXT:  .LBB11_2: ; %bb5
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s46
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    v_readlane_b32 s49, v40, 16
+; GFX11-NEXT:    v_readlane_b32 s48, v40, 15
+; GFX11-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX11-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX11-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX11-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 17
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:32 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_addk_i32 s32, 0xffd0
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
   %tmp2 = insertelement <4 x float> undef, float %arg, i32 0
@@ -273,22 +1009,74 @@ bb5:
   ret void
 }
 
-; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
-; GCN: s_and_saveexec_b64
-; MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4
-
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
-
-; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
-; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
-
-; GFX9-FLATSCR:      v_mov_b32_e32 [[SP:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]]
-
-; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
+; CI-LABEL: alloca_ptr_nonentry_block:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_2
+; CI-NEXT:  ; %bb.1: ; %bb
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshr_b32_e64 v0, s32, 6
+; CI-NEXT:    v_or_b32_e32 v0, 4, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:  .LBB12_2: ; %ret
+; CI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-MUBUF-LABEL: alloca_ptr_nonentry_block:
+; GFX9-MUBUF:       ; %bb.0:
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-MUBUF-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-MUBUF-NEXT:  ; %bb.1: ; %bb
+; GFX9-MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GFX9-MUBUF-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-MUBUF-NEXT:    ds_write_b32 v0, v0
+; GFX9-MUBUF-NEXT:  .LBB12_2: ; %ret
+; GFX9-MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: alloca_ptr_nonentry_block:
+; GFX9-FLATSCR:       ; %bb.0:
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb
+; GFX9-FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-FLATSCR-NEXT:    v_or_b32_e32 v0, 4, v0
+; GFX9-FLATSCR-NEXT:    ds_write_b32 v0, v0
+; GFX9-FLATSCR-NEXT:  .LBB12_2: ; %ret
+; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: alloca_ptr_nonentry_block:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-NEXT:  ; %bb.1: ; %bb
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_or_b32_e64 v0, 4, s32
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:  .LBB12_2: ; %ret
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %alloca0 = alloca { i8, i32 }, align 8, addrspace(5)
   %cmp = icmp eq i32 %arg0, 0
   br i1 %cmp, label %bb, label %ret
@@ -308,14 +1096,67 @@ ret:
 %type.i16 = type { i16 }
 @_ZZN0 = external hidden addrspace(3) global %struct0, align 8
 
+define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
+; CI-LABEL: tied_operand_test:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_add_u32 s0, s0, s9
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_load_dword s4, s[4:5], 0x1
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshl_b32 s4, s4, 1
+; CI-NEXT:    v_mov_b32_e32 v2, s4
+; CI-NEXT:    ds_write_b16 v2, v1 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b16 v2, v0 offset:10
+; CI-NEXT:    s_endpgm
+;
+; GFX9-MUBUF-LABEL: tied_operand_test:
+; GFX9-MUBUF:       ; %bb.0: ; %entry
+; GFX9-MUBUF-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX9-MUBUF-NEXT:    s_load_dword s4, s[4:5], 0x4
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX9-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-MUBUF-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX9-MUBUF-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-MUBUF-NEXT:    ds_write_b16 v1, v2 offset:8
+; GFX9-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT:    ds_write_b16 v1, v0 offset:10
+; GFX9-MUBUF-NEXT:    s_endpgm
+;
+; GFX9-FLATSCR-LABEL: tied_operand_test:
+; GFX9-FLATSCR:       ; %bb.0: ; %entry
+; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 0
+; GFX9-FLATSCR-NEXT:    scratch_load_ushort v0, off, s2 offset:4
+; GFX9-FLATSCR-NEXT:    s_load_dword s0, s[0:1], 0x4
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v1, v2 offset:8
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    ds_write_b16 v1, v0 offset:10
+; GFX9-FLATSCR-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-DAG:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
-; GFX11-DAG:     v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
+; GFX11-NEXT:    scratch_load_u16 v0, off, off offset:4
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x4
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    ds_store_b16 v1, v2 offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b16 v1, v0 offset:10
 ; GFX11-NEXT:    s_endpgm
-define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
 entry:
   %scratch0 = alloca i16, align 4, addrspace(5)
   %scratch1 = alloca i16, align 4, addrspace(5)
@@ -333,3 +1174,7 @@ entry:
 }
 
 attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX9: {{.*}}
+; MUBUF: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4e9376d8277737..bc9753b753c3d89 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -785,7 +785,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -804,7 +804,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
 ; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -823,7 +823,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
-; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2395,6 +2395,45 @@ entry:
   ret void
 }
 
+define i32 @mul_pow2_plus_1(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+  %mul = mul i32 %val, 9
+  ret i32 %mul
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..d4e936e34a29150 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -141,9 +141,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT: bb.2.Flow:
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
-  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
-  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
+  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4
+  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
+  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.3
   ; SI-NEXT: {{  $}}
@@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
-  ; SI-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+  ; SI-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5.if.end:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index c71dc06c68d8d68..9183f043f052cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -92,20 +92,20 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:  .LBB2_1: ; %if.end
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v0
+; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v3
 ; SI-NEXT:    s_add_i32 s1, s1, 1
 ; SI-NEXT:    s_cmp_lt_i32 s1, s0
 ; SI-NEXT:    s_cbranch_scc0 .LBB2_6
 ; SI-NEXT:  .LBB2_2: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; SI-NEXT:    s_xor_b32 s2, exec_lo, s2
 ; SI-NEXT:  ; %bb.3: ; %else
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_lo_u32 v0, v2, 3
-; SI-NEXT:    v_mul_f32_e32 v3, v1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, v1, v2
+; SI-NEXT:    v_lshl_add_u32 v3, v2, 1, v2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  ; %bb.4: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
@@ -113,11 +113,11 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:    s_cbranch_execz .LBB2_1
 ; SI-NEXT:  ; %bb.5: ; %if
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_f32_e32 v3, s1, v1
-; SI-NEXT:    v_add_nc_u32_e32 v0, 1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, s1, v1
+; SI-NEXT:    v_add_nc_u32_e32 v3, 1, v2
 ; SI-NEXT:    s_branch .LBB2_1
 ; SI-NEXT:  .LBB2_6: ; %for.end
-; SI-NEXT:    v_add_f32_e32 v0, v0, v3
+; SI-NEXT:    v_add_f32_e32 v0, v3, v0
 ; SI-NEXT:    ; return to shader part epilog
 entry:
 ;  %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95f947cbca14f05..6bb066f06dd9a24 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
 ; GFX9-W64-NEXT:  ; %bb.3: ; %IF
-; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX9-W64-NEXT:  ; %bb.4: ; %END
 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
 ; GFX10-W32-NEXT:  ; %bb.3: ; %IF
-; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX10-W32-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX10-W32-NEXT:  ; %bb.4: ; %END
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12



More information about the llvm-commits mailing list