[llvm] 4a3b055 - [AMDGPU] Fix flags of V_MOV_B64_PSEUDO

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 9 12:49:38 PDT 2021


Author: Stanislav Mekhanoshin
Date: 2021-07-09T12:49:28-07:00
New Revision: 4a3b0556536d5a2555f7a19f953f0eec0f79f1a9

URL: https://github.com/llvm/llvm-project/commit/4a3b0556536d5a2555f7a19f953f0eec0f79f1a9
DIFF: https://github.com/llvm/llvm-project/commit/4a3b0556536d5a2555f7a19f953f0eec0f79f1a9.diff

LOG: [AMDGPU] Fix flags of V_MOV_B64_PSEUDO

In particular, the instruction was not marked rematerializable.
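
For context: whether the register allocator may recompute a def at its
uses instead of spilling it is gated on the MCInstrDesc bit that
"let isReMaterializable = 1" sets, so with the bit unset every
V_MOV_B64_PSEUDO value had to be kept live or spilled. Below is a minimal
sketch of the query path, simplified from the stock TargetInstrInfo API
(the helper name is made up; the accessors are the standard ones):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Sketch only, not verbatim LLVM source: passes ask roughly this before
// rematerializing an instruction at a use point.
static bool canRematerialize(const llvm::TargetInstrInfo &TII,
                             const llvm::MachineInstr &MI) {
  // getDesc().isRematerializable() mirrors the TableGen flag; it was
  // false for V_MOV_B64_PSEUDO before this patch, so this always failed.
  return MI.getDesc().isRematerializable() &&
         TII.isTriviallyReMaterializable(MI);
}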

Differential Revision: https://reviews.llvm.org/D105724

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/remat-vop.mir
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1d086d14a00c..0475963066a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -104,7 +104,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 // 64-bit vector move instruction. This is mainly used by the
 // SIFoldOperands pass to enable folding of inline immediates.
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
-                                      (ins VSrc_b64:$src0)>;
+                                      (ins VSrc_b64:$src0)> {
+  let isReMaterializable = 1;
+  let isAsCheapAsAMove = 1;
+  let isMoveImm = 1;
+  let SchedRW = [Write64Bit];
+  let Size = 16; // Needs at most two v_mov_b32 instructions, 8 bytes each.
+}
 
 // 64-bit vector move with dpp. Expanded post-RA.
 def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {

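The remaining flags surface through the same descriptor plumbing. A short,
hedged sketch of where each one is read (hypothetical helper name, assuming
the default TargetInstrInfo hooks; the accessors themselves are standard):

#include "llvm/CodeGen/MachineInstr.h"

// Sketch only: how the other new bits on V_MOV_B64_PSEUDO are consumed.
static void queryPseudoFlags(const llvm::MachineInstr &MI) {
  // isAsCheapAsAMove lowers the spill weight of the defined value,
  // biasing the allocator toward recomputing it rather than spilling.
  bool Cheap = MI.isAsCheapAsAMove();
  // isMoveImm identifies the instruction as an immediate move to
  // immediate-folding heuristics.
  bool MovImm = MI.isMoveImmediate();
  // Size = 16 reaches MCInstrDesc::getSize(), which AMDGPU's
  // getInstSizeInBytes() consults when bounding branch distances before
  // the pseudo is expanded (two v_mov_b32, up to 8 bytes each).
  unsigned Bytes = MI.getDesc().getSize();
  (void)Cheap; (void)MovImm; (void)Bytes;
}
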
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index b4b0037ab677..7ab87e4275cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -18,8 +18,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX9-NEXT:    s_or_b32 s0, s0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -36,8 +36,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    s_or_b32 s0, s0, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -54,8 +54,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX7-NEXT:    s_or_b32 s0, s0, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -85,19 +85,19 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %
 define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
 ; GFX9-LABEL: insertelement_v_v2i16_s_s:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_and_b32 s1, s3, 1
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s2, s2, s0
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX9-NEXT:    s_not_b32 s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v1
+; GFX9-NEXT:    global_load_dword v2, v[0:1], off
+; GFX9-NEXT:    s_and_b32 s0, s3, 1
+; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_and_b32 s2, s2, s1
+; GFX9-NEXT:    s_lshl_b32 s2, s2, s0
+; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s0, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -112,10 +112,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -130,16 +130,16 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    s_not_b32 s0, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_s:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v2, v[0:1], off
 ; GFX10-NEXT:    s_and_b32 s0, s3, 1
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
@@ -147,10 +147,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX10-NEXT:    s_lshl_b32 s0, s2, s0
 ; GFX10-NEXT:    s_not_b32 s1, s1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s1, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
@@ -166,14 +166,14 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s1, s4, 1
 ; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v0
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_lshl_or_b32 v2, v2, s1, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -185,12 +185,12 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX8-NEXT:    v_or_b32_e32 v2, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -201,13 +201,13 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
 ; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    v_or_b32_e32 v2, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -217,13 +217,13 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_and_b32 s1, s4, 1
 ; GFX10-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX10-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, s2, v0
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
-; GFX10-NEXT:    v_lshl_or_b32 v2, v0, s1, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
+; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
@@ -240,13 +240,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_and_b32 s2, s4, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
-; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, s0, v0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -257,14 +257,14 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX8-NEXT:    s_and_b32 s2, s4, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -275,14 +275,14 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_and_b32 s2, s4, s1
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v2, s2, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -294,12 +294,12 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
 ; GFX10-NEXT:    s_and_b32 s1, s4, s1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
-; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, s0, v1, v0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
@@ -315,13 +315,13 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
-; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, s0, v1, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -331,14 +331,14 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s1
-; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -349,14 +349,14 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v1
-; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v1
+; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -367,12 +367,12 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, s0, v1, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v2, s0, v2, v3
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
@@ -384,18 +384,18 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %
 define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
 ; GFX9-LABEL: insertelement_v_v2i16_s_v:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    global_load_dword v3, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_and_b32 s1, s2, s0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -410,10 +410,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -428,27 +428,27 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_v:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
 ; GFX10-NEXT:    s_and_b32 s0, s2, s0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v0, v2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
+; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
@@ -460,17 +460,17 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i
 define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
 ; GFX9-LABEL: insertelement_v_v2i16_v_s:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_and_b32 s1, s2, 1
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    global_load_dword v3, v[0:1], off
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX9-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX9-NEXT:    s_not_b32 s0, s0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v0, s0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -483,12 +483,12 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -501,28 +501,28 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX7-NEXT:    s_not_b32 s0, s0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_s:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    s_and_b32 s0, s2, 1
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v0, s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
@@ -534,17 +534,17 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i
 define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
 ; GFX9-LABEL: insertelement_v_v2i16_v_v:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -558,10 +558,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -576,26 +576,26 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_v:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v2, v0, v2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
@@ -674,15 +674,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    s_not_b32 s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s0, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -698,15 +698,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -722,15 +722,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX7-NEXT:    s_not_b32 s0, s0
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -748,12 +748,12 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v2, v2, s0, s2
+; GFX10-NEXT:    v_and_or_b32 v4, v2, s0, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
@@ -777,15 +777,15 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, s4
 ; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_lshl_or_b32 v2, v0, s4, v1
+; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -804,14 +804,14 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX8-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -829,15 +829,15 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -857,12 +857,12 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, s4
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
-; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s4, s3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
@@ -888,14 +888,14 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_or_b32 v3, v1, v0, v3
+; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -916,14 +916,14 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v4, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -944,14 +944,14 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -959,25 +959,25 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-NEXT:    s_and_b32 s3, s4, s2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s2
-; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s3
+; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s3
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v2, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
+; GFX10-NEXT:    v_and_or_b32 v5, v5, v2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
@@ -1002,14 +1002,14 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v0
+; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1029,14 +1029,14 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1057,14 +1057,14 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -1085,11 +1085,11 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v5, v5, v3, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
@@ -1102,94 +1102,94 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    s_and_b32 s1, s2, s0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v4, v2, s1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v5, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v7, v2, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v4i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v7, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_s_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    s_and_b32 s1, s2, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, s1, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v6, s1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v7, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
 ; GFX10-NEXT:    s_and_b32 s0, s2, s0
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
-; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v4
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v3, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v5
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v3, v5, v4, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
@@ -1210,15 +1210,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_not_b32 s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v5, s0, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v4i16_v_s:
@@ -1229,20 +1229,20 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_not_b32 s0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_s:
@@ -1257,16 +1257,16 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX7-NEXT:    s_not_b32 s0, s0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_s:
@@ -1282,12 +1282,12 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_not_b32 s0, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
+; GFX10-NEXT:    v_and_or_b32 v4, v3, s0, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s2, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
@@ -1300,91 +1300,91 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i
 ; GFX9-LABEL: insertelement_v_v4i16_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v5, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v4i16_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX8-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, v5, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_v:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX7-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
-; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v6
+; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc_lo
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v4, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
+; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
@@ -1423,10 +1423,10 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1459,10 +1459,10 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1558,20 +1558,20 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
 ; GFX9-NEXT:    s_not_b32 s5, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s5, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v6, v7, s5, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1589,19 +1589,19 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_not_b32 s6, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, s5, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, s5, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1655,14 +1655,14 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, s2
+; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
@@ -1690,21 +1690,21 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, s4
 ; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
+; GFX9-NEXT:    v_lshl_or_b32 v6, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1727,20 +1727,20 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, s4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
-; GFX8-NEXT:    v_or_b32_e32 v4, s4, v0
+; GFX8-NEXT:    v_or_b32_e32 v6, s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1803,16 +1803,16 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, s4
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
-; GFX10-NEXT:    v_lshl_or_b32 v4, v4, s4, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -1844,18 +1844,18 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v1, v0, v2
+; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1882,18 +1882,18 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v6, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1940,17 +1940,17 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX10-NEXT:    s_and_b32 s1, s4, s0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v4
-; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v1, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
@@ -1960,13 +1960,13 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10-NEXT:    v_and_or_b32 v5, v7, v6, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -1997,18 +1997,18 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT:    v_and_or_b32 v5, v2, v1, v0
+; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2034,18 +2034,18 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v6, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2091,17 +2091,17 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-LABEL: insertelement_s_v8i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, s4, v1, vcc_lo
@@ -2111,13 +2111,13 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s7
-; GFX10-NEXT:    v_and_or_b32 v5, v7, v6, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
+; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
@@ -2141,19 +2141,19 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v7, v7, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v9, v9, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_s_v:
@@ -2170,20 +2170,20 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v1, v7, v1
-; GFX8-NEXT:    v_or_b32_e32 v7, v1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v1, v9, v1
+; GFX8-NEXT:    v_or_b32_e32 v9, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_s_v:
@@ -2237,14 +2237,14 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
-; GFX10-NEXT:    v_and_or_b32 v7, v2, v7, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    v_and_or_b32 v9, v2, v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
+; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2266,19 +2266,19 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_not_b32 s5, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v7, v1, s5, v0
+; GFX9-NEXT:    v_and_or_b32 v9, v1, s5, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_v_s:
@@ -2295,20 +2295,20 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v1, s5, v1
-; GFX8-NEXT:    v_or_b32_e32 v7, v1, v0
+; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_v_s:
@@ -2355,20 +2355,20 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_not_b32 s2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
-; GFX10-NEXT:    v_and_or_b32 v7, v0, s2, v1
+; GFX10-NEXT:    v_and_or_b32 v9, v0, s2, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
+; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2390,7 +2390,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -2398,11 +2400,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i16_v_v:
@@ -2418,7 +2418,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -2427,11 +2429,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i16_v_v:
@@ -2480,6 +2480,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
@@ -2487,11 +2489,9 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2544,10 +2544,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
 ; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
@@ -2603,18 +2603,18 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
 ; GFX8-NEXT:    s_cselect_b32 s6, s16, s14
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_cselect_b32 s7, s16, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3050,12 +3050,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_v_s:
@@ -3295,12 +3294,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_s_v:
@@ -3535,12 +3533,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v16i16_v_v:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index b2d8b512cf3c..9a658af63641 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -12,15 +12,15 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -32,15 +32,15 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -75,13 +75,13 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s0
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -97,15 +97,15 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -115,15 +115,15 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -156,13 +156,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
 ; GFX10-NEXT:    s_movk_i32 s0, 0xff
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -178,15 +178,15 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -197,15 +197,15 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    flat_load_ushort v1, v[1:2]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -241,11 +241,11 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -262,15 +262,15 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -282,15 +282,15 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -327,11 +327,11 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s4, vcc_lo
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -352,10 +352,10 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -371,10 +371,10 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -410,11 +410,11 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr
@@ -430,15 +430,15 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -448,15 +448,15 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -491,11 +491,11 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -510,15 +510,15 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -527,15 +527,15 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -569,11 +569,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -588,15 +588,15 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -605,15 +605,15 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -647,11 +647,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr
@@ -744,11 +744,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX9-NEXT:    v_or3_b32 v2, v0, v2, v1
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v4, v2, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -780,11 +780,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -850,14 +850,14 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    s_not_b32 s2, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, s0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-NEXT:    v_or3_b32 v2, v0, v2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v4, v2, v3
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -891,14 +891,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_andn2_b32 s0, s0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s3, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s5, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT:    v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -930,11 +930,11 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1002,11 +1002,11 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT:    v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1039,15 +1039,15 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v0, s0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    s_mov_b32 s2, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s5, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT:    v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1080,11 +1080,11 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1154,11 +1154,11 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT:    v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1190,15 +1190,15 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v0, s0, v1, v0
 ; GFX9-NEXT:    s_mov_b32 s1, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    s_mov_b32 s2, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX9-NEXT:    v_or3_b32 v2, v2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v3, v4
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1230,11 +1230,11 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1303,11 +1303,11 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    s_mov_b32 s0, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX10-NEXT:    v_or3_b32 v2, v0, v3, v1
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v4, v3, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -1342,11 +1342,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v1
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v4, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1378,11 +1378,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1450,12 +1450,12 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX10-NEXT:    v_or3_b32 v2, v0, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, v0, s1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v2, v4, v3
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1489,11 +1489,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v1
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s3, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v4, v3, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1525,11 +1525,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1594,14 +1594,14 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s0, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
-; GFX10-NEXT:    v_or3_b32 v2, v0, v2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v4, v0, s0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v4, v2, v3
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1636,11 +1636,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v1, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
-; GFX9-NEXT:    v_or3_b32 v2, v0, v4, v1
+; GFX9-NEXT:    v_and_or_b32 v3, v0, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_or3_b32 v2, v3, v4, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1671,11 +1671,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1743,12 +1743,12 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX10-NEXT:    v_or3_b32 v2, v0, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v2, v4, v3
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@@ -1763,8 +1763,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT:    s_mov_b32 s8, 0x80008
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_u32 s9, s0, s8
 ; GFX9-NEXT:    s_and_b32 s7, s0, s6
@@ -1821,9 +1821,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v8i8_s_s:
@@ -1831,8 +1831,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    s_mov_b32 s8, 0x80008
 ; GFX8-NEXT:    s_movk_i32 s6, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s9, s0, s8
 ; GFX8-NEXT:    s_and_b32 s7, s0, s6
@@ -1889,9 +1889,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_s_v8i8_s_s:
@@ -1970,8 +1970,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-NEXT:    s_mov_b32 s6, 0x80010
 ; GFX10-NEXT:    s_lshr_b32 s7, s5, 2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_u32 s11, s0, s3
 ; GFX10-NEXT:    s_bfe_u32 s13, s1, s3
@@ -2009,26 +2009,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    s_bfe_u32 s7, s0, s3
 ; GFX10-NEXT:    s_bfe_u32 s3, s1, s3
 ; GFX10-NEXT:    s_and_b32 s5, s0, s2
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX10-NEXT:    s_and_b32 s2, s1, s2
 ; GFX10-NEXT:    s_bfe_u32 s1, s1, s6
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX10-NEXT:    s_or_b32 s5, s5, s7
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX10-NEXT:    s_bfe_u32 s0, s0, s6
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_or_b32 s2, s2, s3
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX10-NEXT:    s_or_b32 s0, s5, s0
-; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_or_b32 s5, s5, s7
 ; GFX10-NEXT:    s_or_b32 s1, s2, s1
 ; GFX10-NEXT:    s_lshl_b32 s2, s8, 24
-; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_or_b32 s0, s5, s0
+; GFX10-NEXT:    s_lshl_b32 s3, s4, 24
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2051,49 +2051,49 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT:    s_not_b32 s3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v5
-; GFX9-NEXT:    v_or3_b32 v1, v1, v10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s3, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
-; GFX9-NEXT:    v_or3_b32 v1, v1, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v6, v7, s3, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v5, v4
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_mov_b32_e32 v3, 16
+; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX8-NEXT:    s_lshr_b32 s1, s3, 2
 ; GFX8-NEXT:    s_and_b32 s3, s3, 3
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
@@ -2103,45 +2103,45 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_mov_b32_e32 v5, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2245,21 +2245,21 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v5
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, v0, s4, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or3_b32 v0, v0, v7, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_and_or_b32 v3, v1, s4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v2, v7, v4
+; GFX10-NEXT:    v_or3_b32 v3, v3, v8, v5
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
  %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2321,12 +2321,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v2, v1, s7, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2387,10 +2387,10 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2510,18 +2510,18 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v5
-; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
-; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v3, v5, v7, v4
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2584,12 +2584,12 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v2, v1, s7, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v1, s7, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2652,10 +2652,10 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2781,15 +2781,15 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
-; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
+; GFX10-NEXT:    v_or3_b32 v3, v5, v7, v4
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2851,12 +2851,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v2, v1, s6, v2
+; GFX9-NEXT:    v_and_or_b32 v4, v1, s6, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2918,10 +2918,10 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -3046,15 +3046,15 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s2, v3
+; GFX10-NEXT:    v_and_or_b32 v3, v0, s2, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s2, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX10-NEXT:    v_or3_b32 v0, v0, v6, v2
-; GFX10-NEXT:    v_or3_b32 v1, v1, v7, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_and_or_b32 v5, v1, s2, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v3, v6, v2
+; GFX10-NEXT:    v_or3_b32 v3, v5, v7, v4
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -3068,106 +3068,106 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_and_b32 s2, s2, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v8, v2, s2
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    v_mov_b32_e32 v4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v2, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_or3_b32 v0, v0, v12, v9
+; GFX9-NEXT:    v_or3_b32 v1, v1, v14, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v2
-; GFX9-NEXT:    v_or3_b32 v1, v1, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
+; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v3, 8
-; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v8, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v10, v2, s1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX8-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v9
+; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i8_s_v:
@@ -3271,20 +3271,20 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v5
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
+; GFX10-NEXT:    v_and_or_b32 v5, v0, s3, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_or3_b32 v1, v1, v3, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or3_b32 v0, v0, v7, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_and_or_b32 v8, v1, s3, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v5, v7, v4
+; GFX10-NEXT:    v_or3_b32 v3, v8, v3, v6
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -3306,41 +3306,41 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    s_not_b32 s2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    v_mov_b32_e32 v4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
+; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v5
-; GFX9-NEXT:    v_or3_b32 v1, v1, v10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v5, s2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v7, s2, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v2
-; GFX9-NEXT:    v_or3_b32 v1, v1, v4, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v2
+; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v5
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_v_s:
@@ -3348,55 +3348,55 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, 2
 ; GFX8-NEXT:    s_and_b32 s2, s2, 3
-; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 16
-; GFX8-NEXT:    v_mov_b32_e32 v7, s2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX8-NEXT:    s_not_b32 s0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i8_v_s:
@@ -3499,20 +3499,20 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, s3, v5
-; GFX10-NEXT:    v_and_or_b32 v1, v1, s3, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or3_b32 v0, v0, v7, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_and_or_b32 v2, v0, s3, v5
+; GFX10-NEXT:    v_and_or_b32 v3, v1, s3, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v2, v7, v4
+; GFX10-NEXT:    v_or3_b32 v3, v3, v8, v5
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -3526,105 +3526,105 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 16
+; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    v_mov_b32_e32 v8, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s2, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s2, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_or3_b32 v0, v0, v11, v8
-; GFX9-NEXT:    v_or3_b32 v1, v1, v13, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v8, v3, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s2, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s2, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v10
+; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v10, v3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v6, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v5
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v6, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v2
-; GFX9-NEXT:    v_or3_b32 v1, v1, v6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v2
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v3
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v8i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 2, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v5, 8
-; GFX8-NEXT:    v_mov_b32_e32 v6, 16
+; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v8, 16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
 ; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT:    v_mov_b32_e32 v7, 8
-; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v9, 8
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v8i8_v_v:
@@ -3728,21 +3728,21 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v5, v6
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v5, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v2, v0, v5, v6
+; GFX10-NEXT:    v_and_or_b32 v3, v1, v5, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX10-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or3_b32 v0, v0, v8, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_or3_b32 v2, v2, v8, v4
+; GFX10-NEXT:    v_or3_b32 v3, v3, v9, v5
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -4235,90 +4235,90 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    s_mov_b32 s1, 16
-; GFX9-NEXT:    v_mov_b32_e32 v4, 8
+; GFX9-NEXT:    v_mov_b32_e32 v6, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v5, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v7, 16
 ; GFX9-NEXT:    s_and_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX9-NEXT:    s_lshl_b32 s3, s6, s3
 ; GFX9-NEXT:    s_not_b32 s5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_or3_b32 v0, v0, v12, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v14, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v17
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_or3_b32 v2, v2, v16, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v0, v0, v14, v9
+; GFX9-NEXT:    v_or3_b32 v1, v1, v16, v10
+; GFX9-NEXT:    v_and_or_b32 v13, v3, s6, v19
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_or3_b32 v2, v2, v18, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_or3_b32 v3, v3, v18, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v6, v7, s5, v6
+; GFX9-NEXT:    v_or3_b32 v3, v13, v3, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v8, v9, s5, v8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
-; GFX9-NEXT:    v_or3_b32 v3, v3, v5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_or3_b32 v0, v0, v11, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v13, v7
-; GFX9-NEXT:    v_or3_b32 v2, v2, v15, v8
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v9
+; GFX9-NEXT:    v_or3_b32 v2, v2, v17, v10
+; GFX9-NEXT:    v_or3_b32 v3, v3, v7, v6
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_s_s:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 8
-; GFX8-NEXT:    v_mov_b32_e32 v5, 16
+; GFX8-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 16
+; GFX8-NEXT:    v_mov_b32_e32 v9, 16
 ; GFX8-NEXT:    s_and_b32 s1, s3, 3
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s4, s3, 2
@@ -4330,75 +4330,75 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_not_b32 s6, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v19
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, s5, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v10
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, s5, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -4566,32 +4566,32 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, s4, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX10-NEXT:    v_and_or_b32 v1, v1, s4, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s4, v14
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_or3_b32 v3, v3, v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v11, v6
 ; GFX10-NEXT:    v_or3_b32 v1, v1, v13, v7
 ; GFX10-NEXT:    v_or3_b32 v2, v2, v15, v8
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v3, v3, v16, v9
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -4692,18 +4692,18 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s11, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s11, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v3, s11, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT:    v_and_or_b32 v6, v3, s11, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4798,22 +4798,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -5036,8 +5036,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_or3_b32 v1, v9, v1, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5139,18 +5139,18 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s12, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s12, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v3, s12, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT:    v_and_or_b32 v6, v3, s12, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -5247,22 +5247,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -5486,8 +5486,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_or3_b32 v1, v9, v1, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5588,18 +5588,18 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v2, s10, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v5, v2, s10, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v3, s10, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
-; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
+; GFX9-NEXT:    v_and_or_b32 v6, v3, s10, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v3, v6, v3, v7
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -5695,22 +5695,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -5933,8 +5933,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX10-NEXT:    v_or3_b32 v1, v9, v1, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_or3_b32 v2, v11, v2, v7
-; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v3, v10, v3, v8
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr
@@ -5951,17 +5951,9 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 16
-; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT:    s_and_b32 s2, s2, s6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v8, v2, s2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v7
-; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v7
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
@@ -5970,80 +5962,80 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    s_and_b32 s0, s2, s6
 ; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
 ; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT:    v_lshlrev_b32_e64 v17, v2, s0
 ; GFX9-NEXT:    v_and_or_b32 v13, v6, s6, v19
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
 ; GFX9-NEXT:    v_or3_b32 v6, v13, v6, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v8
+; GFX9-NEXT:    v_and_or_b32 v2, v9, v2, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v12
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v14
-; GFX9-NEXT:    v_and_or_b32 v8, v2, s6, v0
-; GFX9-NEXT:    v_or3_b32 v0, v1, v11, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_or3_b32 v1, v4, v13, v6
-; GFX9-NEXT:    v_or3_b32 v2, v5, v15, v7
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_or3_b32 v3, v8, v16, v9
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
+; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
+; GFX9-NEXT:    v_or3_b32 v2, v5, v17, v9
+; GFX9-NEXT:    v_or3_b32 v3, v10, v18, v11
+; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 8
-; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 2, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX8-NEXT:    v_lshlrev_b32_e64 v10, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v9
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v9
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@@ -6053,10 +6045,12 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 2, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
@@ -6064,55 +6058,61 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX8-NEXT:    v_lshlrev_b32_e64 v17, v2, s1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v9, v0, v13
-; GFX8-NEXT:    v_or_b32_e32 v10, v1, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v8, v2, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
+; GFX8-NEXT:    v_or_b32_e32 v11, v0, v15
+; GFX8-NEXT:    v_or_b32_e32 v12, v1, v17
+; GFX8-NEXT:    v_or_b32_e32 v10, v2, v10
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v5
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v10, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v8, v7
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_or_b32_e32 v1, v11, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v12, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v16i8_s_v:
@@ -6280,31 +6280,31 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, s3, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v10, v0, s3, v1
 ; GFX10-NEXT:    v_and_or_b32 v3, v3, s3, v12
+; GFX10-NEXT:    v_and_or_b32 v12, v4, s3, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v5
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v14, v0, s3, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_or3_b32 v0, v2, v11, v5
-; GFX10-NEXT:    v_or3_b32 v2, v4, v15, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_or3_b32 v0, v2, v11, v10
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v13, v6
-; GFX10-NEXT:    v_or3_b32 v3, v10, v7, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v2, v12, v15, v8
+; GFX10-NEXT:    v_or3_b32 v3, v14, v7, v9
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -6321,77 +6321,77 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX9-NEXT:    s_movk_i32 s6, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX9-NEXT:    s_not_b32 s5, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v13
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_or3_b32 v3, v3, v12, v7
-; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v6, v6, s6, v17
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v3, v3, v14, v9
+; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v10
+; GFX9-NEXT:    v_and_or_b32 v13, v6, s6, v19
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v2, v7, s5, v2
+; GFX9-NEXT:    v_or3_b32 v6, v13, v6, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v9, s5, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v12
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v14
-; GFX9-NEXT:    v_and_or_b32 v8, v2, s6, v0
-; GFX9-NEXT:    v_or3_b32 v0, v1, v11, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_or3_b32 v1, v4, v13, v6
-; GFX9-NEXT:    v_or3_b32 v2, v5, v15, v7
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_or3_b32 v3, v8, v16, v9
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v10, v2, s6, v0
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v16
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v3
+; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v6
+; GFX9-NEXT:    v_or3_b32 v2, v5, v17, v9
+; GFX9-NEXT:    v_or3_b32 v3, v10, v18, v11
+; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_v_s:
@@ -6400,11 +6400,11 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_and_b32 s1, s2, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
-; GFX8-NEXT:    v_mov_b32_e32 v7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v9, s1
-; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_lshr_b32 s4, s2, 2
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
@@ -6412,76 +6412,76 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    s_not_b32 s5, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v6
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v5, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
-; GFX8-NEXT:    v_or_b32_e32 v5, v6, v18
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v9
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v5, s5, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[2:3]
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX8-NEXT:    v_or_b32_e32 v9, v0, v13
-; GFX8-NEXT:    v_or_b32_e32 v10, v1, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; GFX8-NEXT:    v_or_b32_e32 v8, v2, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
+; GFX8-NEXT:    v_or_b32_e32 v11, v0, v15
+; GFX8-NEXT:    v_or_b32_e32 v12, v1, v17
+; GFX8-NEXT:    v_or_b32_e32 v10, v2, v10
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v5
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v10, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v8, v7
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_or_b32_e32 v1, v11, v5
+; GFX8-NEXT:    v_or_b32_e32 v2, v12, v6
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v16i8_v_s:
@@ -6650,29 +6650,29 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 24, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v1, v3, s3, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
-; GFX10-NEXT:    v_and_or_b32 v4, v4, s3, v12
-; GFX10-NEXT:    v_and_or_b32 v5, v5, s3, v14
-; GFX10-NEXT:    v_and_or_b32 v8, v2, s3, v0
+; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, v12
+; GFX10-NEXT:    v_and_or_b32 v10, v5, s3, v14
+; GFX10-NEXT:    v_and_or_b32 v12, v2, s3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX10-NEXT:    v_or3_b32 v0, v1, v11, v3
-; GFX10-NEXT:    v_or3_b32 v1, v4, v13, v6
-; GFX10-NEXT:    v_or3_b32 v2, v5, v15, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_or3_b32 v3, v8, v16, v9
+; GFX10-NEXT:    v_or3_b32 v0, v1, v11, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v1, v6, v13, v7
+; GFX10-NEXT:    v_or3_b32 v2, v10, v15, v8
+; GFX10-NEXT:    v_or3_b32 v3, v12, v16, v9
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
@@ -6686,169 +6686,169 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NEXT:    s_mov_b32 s1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 16
+; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
+; GFX9-NEXT:    v_and_or_b32 v17, v7, v0, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s2, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 2, v3
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s2, v13
+; GFX9-NEXT:    v_and_or_b32 v13, v4, s2, v13
+; GFX9-NEXT:    v_and_or_b32 v15, v5, s2, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v7
 ; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v9
-; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v10
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    v_and_or_b32 v13, v7, v0, v19
+; GFX9-NEXT:    v_or3_b32 v9, v13, v14, v9
+; GFX9-NEXT:    v_or3_b32 v10, v15, v16, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v11
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v4, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v9, v10, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v0
-; GFX9-NEXT:    v_or3_b32 v7, v13, v7, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v15
+; GFX9-NEXT:    v_or3_b32 v7, v17, v7, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v19
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v7, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v2, v9, v3, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v2, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v2, v11, v3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT:    v_and_or_b32 v4, v4, v0, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NEXT:    v_and_or_b32 v5, v5, v0, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v11
-; GFX9-NEXT:    v_and_or_b32 v11, v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v13
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v9, v9, v0, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX9-NEXT:    v_or3_b32 v1, v4, v14, v7
-; GFX9-NEXT:    v_or3_b32 v2, v5, v16, v9
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_or3_b32 v0, v3, v12, v6
-; GFX9-NEXT:    v_or3_b32 v3, v11, v8, v10
+; GFX9-NEXT:    v_and_or_b32 v13, v2, v0, v1
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v0, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_or3_b32 v1, v9, v16, v10
+; GFX9-NEXT:    v_or3_b32 v2, v6, v18, v11
+; GFX9-NEXT:    v_or3_b32 v3, v13, v8, v12
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 2, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 8
+; GFX8-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v10, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v11
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v11
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v7
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v7
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT:    v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v18
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v13
-; GFX8-NEXT:    v_or_b32_e32 v5, v6, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v0, v6, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v14
+; GFX8-NEXT:    v_or_b32_e32 v14, v15, v16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v18
+; GFX8-NEXT:    v_or_b32_e32 v3, v14, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX8-NEXT:    v_or_b32_e32 v7, v17, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v3, v1, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v19
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[0:1]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v19
+; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v7, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v0, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v0, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[2:3]
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s[2:3]
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v11
-; GFX8-NEXT:    v_or_b32_e32 v8, v0, v10
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v6
-; GFX8-NEXT:    v_or_b32_e32 v3, v8, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v13
+; GFX8-NEXT:    v_or_b32_e32 v10, v0, v10
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v6
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v8
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -7014,35 +7014,35 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v7, s1
 ; GFX10-NEXT:    v_and_or_b32 v0, v10, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v0, s1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_or_b32 v2, v2, v1, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GFX10-NEXT:    v_and_or_b32 v4, v4, v1, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v13
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v5
+; GFX10-NEXT:    v_and_or_b32 v13, v4, v1, v15
 ; GFX10-NEXT:    v_and_or_b32 v8, v0, v1, v8
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v1, v13
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; GFX10-NEXT:    v_or3_b32 v0, v2, v12, v5
-; GFX10-NEXT:    v_or3_b32 v2, v4, v16, v7
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_or3_b32 v0, v2, v12, v11
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v14, v6
-; GFX10-NEXT:    v_or3_b32 v3, v8, v9, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_or3_b32 v2, v13, v16, v7
+; GFX10-NEXT:    v_or3_b32 v3, v8, v9, v10
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index abd6f282e3fb..e9a68d3250bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -799,12 +799,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -814,12 +814,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -848,14 +848,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr)
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -865,14 +865,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr)
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -904,23 +904,23 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
@@ -940,27 +940,27 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
@@ -987,17 +987,17 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
-; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_mov_b32_e32 v5, 0
-; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1009,17 +1009,17 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
-; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1064,14 +1064,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
@@ -1080,14 +1080,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
@@ -1358,12 +1358,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1373,12 +1373,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1407,14 +1407,14 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1424,14 +1424,14 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1463,23 +1463,23 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
@@ -1499,27 +1499,27 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
@@ -1546,17 +1546,17 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
-; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_mov_b32_e32 v5, 0
-; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1568,17 +1568,17 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
-; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1623,14 +1623,14 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
@@ -1639,14 +1639,14 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index 062c0ad91ea9..a68602a17023 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -859,12 +859,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -874,12 +874,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -918,14 +918,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -935,14 +935,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -982,23 +982,23 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_inc_noret_i64:
@@ -1028,27 +1028,27 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1083,17 +1083,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
-; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_mov_b32_e32 v5, 0
-; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1105,17 +1105,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
-; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1123,25 +1123,25 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 42
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc
+; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
@@ -1159,14 +1159,14 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1175,34 +1175,34 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 42
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 42
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc
+; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc
 ; GFX10-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
@@ -1560,16 +1560,16 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 9
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v2, s[2:3]
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v3, s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64:
@@ -1598,12 +1598,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
 ; GCN-LABEL: flat_atomic_inc_ret_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1618,14 +1618,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr)
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1635,14 +1635,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr)
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1667,14 +1667,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr)
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 42
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s2, s2, 32
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1690,12 +1690,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
 ; GCN-LABEL: flat_atomic_inc_noret_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -1705,27 +1705,27 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -1742,14 +1742,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 42
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s0, s0, 32
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    s_endpgm
   %gep = getelementptr i64, i64* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
@@ -1764,17 +1764,17 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_add_i32_e32 v3, vcc, v0, v2
-; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v3
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; CI-NEXT:    v_mov_b32_e32 v4, 42
-; CI-NEXT:    v_mov_b32_e32 v5, 0
-; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1786,17 +1786,17 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v0, v2
-; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v3
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 42
-; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1804,18 +1804,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1825,16 +1825,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 42
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 40
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1858,14 +1858,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -1874,28 +1874,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:40 glc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -1905,13 +1905,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 42
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 40
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GFX10-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64* %ptr, i32 %id

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 29e75289ac95..d8c6accc28ed 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -243,8 +243,8 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
 ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
@@ -296,9 +296,9 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 ; CI: enable_sgpr_queue_ptr = 1
 ; GFX9: enable_sgpr_queue_ptr = 0
 
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6efd94aa201e..d3c2ef94c923 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -3381,8 +3381,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_cbranch_execz BB18_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3415,8 +3415,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_cbranch_execz BB18_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -3448,8 +3448,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_cbranch_execz BB18_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -3798,8 +3798,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_cbranch_execz BB20_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3832,8 +3832,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_cbranch_execz BB20_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -3865,8 +3865,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_cbranch_execz BB20_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -4209,8 +4209,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_cbranch_execz BB22_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4242,8 +4242,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_cbranch_execz BB22_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -4275,8 +4275,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_cbranch_execz BB22_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -4619,8 +4619,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_cbranch_execz BB24_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4652,8 +4652,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    s_cbranch_execz BB24_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
@@ -4685,8 +4685,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    s_cbranch_execz BB24_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index ce4cbb3b643e..70e47dd33838 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -224,10 +224,10 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    global_load_ushort v0, v[0:1], off
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_short_d16_hi v0, v[1:2], off
+; GCN-NEXT:    global_load_short_d16_hi v0, v[2:3], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -292,10 +292,10 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    flat_load_short_d16_hi v0, v[1:2]
+; GCN-NEXT:    flat_load_short_d16_hi v0, v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 9dcffcdb7ca1..0d186f0cc03b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -67,14 +67,14 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 vcc, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -84,11 +84,11 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    ds_write_b32 v3, v2 offset:12
+; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v4, off
@@ -315,15 +315,15 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 vcc, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -333,12 +333,12 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 0x3fb, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
+; GFX10-NEXT:    ds_write2_b32 v2, v3, v4 offset1:1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
 ; GFX10-NEXT:    global_store_dword v[0:1], v5, off

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dad395a30e05..e4bc39d7f7f7 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -85,14 +85,14 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-NEXT:    s_cbranch_vccnz BB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:  BB1_2: ; %bb1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
@@ -112,14 +112,14 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX9-NEXT:    s_cbranch_vccnz BB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:  BB1_2: ; %bb1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -138,14 +138,14 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX10-NEXT:    s_cbranch_vccnz BB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:  BB1_2: ; %bb1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index fcd3346d7dd9..f9a4c0d0a364 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -341,8 +341,8 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
 
 ; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
@@ -354,8 +354,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
 ; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -367,8 +367,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
@@ -378,8 +378,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
 ; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index dd6d780a6eab..7904194e9034 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -190,8 +190,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
 ; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -201,9 +201,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
 ; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -215,8 +215,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
 ; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
@@ -227,8 +227,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
 ; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {

diff  --git a/llvm/test/CodeGen/AMDGPU/remat-vop.mir b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
index 458964ea84cb..5a468b5262aa 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-vop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
@@ -93,26 +93,17 @@ body:             |
     S_NOP 0, implicit %2
     S_ENDPGM 0
 ...
-# FIXME: V_MOV_B64_PSEUDO is not rematerializable
 ---
 name:            test_remat_v_mov_b64_pseudo
 tracksRegLiveness: true
-machineFunctionInfo:
-  stackPtrOffsetReg:  $sgpr32
 body:             |
   bb.0:
     ; GCN-LABEL: name: test_remat_v_mov_b64_pseudo
     ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
-    ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 2, implicit $exec
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-    ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 3, implicit $exec
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.2, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.2, align 4, addrspace 5)
-    ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
     ; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
-    ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 2, implicit $exec
     ; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
-    ; GCN: renamable $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.2, align 4, addrspace 5)
+    ; GCN: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 3, implicit $exec
     ; GCN: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
     ; GCN: S_ENDPGM 0
     %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 50b1ceac35ec..aed5243ab2da 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -433,10 +433,10 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, -1, v2
@@ -446,8 +446,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_not_b32_e32 v11, v18
 ; GCN-IR-NEXT:    v_add_i32_e32 v13, vcc, v10, v14
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v14, vcc, v11, v15, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v14, vcc, v11, v15, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
@@ -471,14 +471,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, v19
 ; GCN-IR-NEXT:    v_mov_b32_e32 v19, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, v11
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB1_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[7:8], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v11, v2
@@ -1503,10 +1503,10 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], 24, v8
@@ -1514,8 +1514,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, 58, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
@@ -1539,14 +1539,14 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v15
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB11_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB11_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v0
@@ -1692,83 +1692,83 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v10, v5, v4, vcc
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v10
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v6
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, s8
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[8:9], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], s[4:5], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, 47, v10
+; GCN-IR-NEXT:    v_lshr_b64 v[12:13], s[4:5], v10
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v13, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v10, v12
+; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v11, v13, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v17, v14, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v14
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
 ; GCN-IR-NEXT:    v_and_b32_e32 v16, v14, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v6
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v17
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v9
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB12_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB12_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v0
 ; GCN-IR-NEXT:  BB12_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v6, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v7, v3
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v8, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v9, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1820,10 +1820,10 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v3, s[4:5], 63, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[7:8], v3
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB13_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[9:10], v[7:8], v9
@@ -1855,14 +1855,14 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v9, s[4:5], v0, v9
 ; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB13_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB13_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB13_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[3:4], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index acadd3b5da44..035168446c91 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -282,10 +282,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v6, s4
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = ashr i128 %lhs, %rhs
@@ -587,8 +587,9 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s6, 64, s16
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
@@ -643,11 +644,10 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v6, s4
 ; GCN-NEXT:    v_mov_b32_e32 v7, s3
-; GCN-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v6, v7, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v10, 16
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 9b0c2abaedf0..a8874203b76a 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -408,52 +408,52 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[0:1], v3
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, -1, v5
 ; GCN-IR-NEXT:    v_lshr_b64 v[16:17], v[0:1], v9
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v6, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v10, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-IR-NEXT:    v_not_b32_e32 v11, v13
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, v10, v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[16:17], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v10, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v16, v16, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v14, v14, v10
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v3, v16
-; GCN-IR-NEXT:    v_subb_u32_e32 v10, vcc, v9, v17, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v14, v7
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v19, v14, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v10, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v18, v14, v6
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v12
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v15, v8
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v13, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v14
+; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v3, v14
+; GCN-IR-NEXT:    v_subb_u32_e32 v10, vcc, v9, v15, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v18, v7
+; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, 1, v12
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v16, 31, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v19, v8
+; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, 0, v13, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[18:19], v[12:13]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v18
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v16, s[4:5], v16, v19
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v11
-; GCN-IR-NEXT:    v_subb_u32_e64 v17, s[4:5], v17, v18, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v10
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 1, v16
+; GCN-IR-NEXT:    v_and_b32_e32 v17, v16, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v16, v16, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v16, s[4:5], v14, v16
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v19
+; GCN-IR-NEXT:    v_mov_b32_e32 v19, v11
+; GCN-IR-NEXT:    v_subb_u32_e64 v17, s[4:5], v15, v17, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v18, v10
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB1_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v8
 ; GCN-IR-NEXT:    v_or_b32_e32 v9, v10, v7
@@ -1682,10 +1682,10 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v6
@@ -1693,8 +1693,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
@@ -1718,14 +1718,14 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB11_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB11_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v2
@@ -1870,84 +1870,84 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v8, v3, v2, vcc
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v4
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v7, v11, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB12_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB12_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
 ; GCN-IR-NEXT:  BB12_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v7
+; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
@@ -2003,10 +2003,10 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB13_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
@@ -2038,14 +2038,14 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB13_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB13_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB13_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index f4d7bdf60102..bfd75635175c 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -374,52 +374,52 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
 ; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[12:13], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v0, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v1, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v10
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v0, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v1, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v8
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v14
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v11
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
+; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB1_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v6, v0
@@ -1244,77 +1244,77 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v8, v3, v2, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v5, vcc, 0xffffffd0, v4
+; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v7
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v5
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[5:6]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[12:13], s[4:5], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, 47, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v7, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v8, v4
+; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, v9, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v14, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v15, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT:    v_and_b32_e32 v5, 1, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v16, v7, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v7, v7, v0
+; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v4, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v6
+; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v5
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB9_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB9_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v5, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v6, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v0
 ; GCN-IR-NEXT:  BB9_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v2
@@ -1359,10 +1359,10 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB10_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
@@ -1394,14 +1394,14 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v11
 ; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB10_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB10_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB10_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
@@ -1736,17 +1736,17 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc4, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
@@ -1769,14 +1769,14 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
 ; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB12_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB12_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index a5b2cc055768..bcb04427e8f8 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -383,52 +383,52 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v2
 ; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v7
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v8, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-IR-NEXT:    v_not_b32_e32 v9, v11
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v8, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, v9, v13, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[14:15], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v14, v14, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v6, v14
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v7, v15, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v12, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v6, v12
+; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v7, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v16, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v10
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v17, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v16
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v14, v17
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v9
-; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
+; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v12, v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v17
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, v9
+; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v13, v15, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, v8
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB1_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v9, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
@@ -1263,84 +1263,84 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v8, v3, v2, vcc
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v4
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, s8
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB8_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v7, v11, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v11, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB8_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB8_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB8_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
 ; GCN-IR-NEXT:  BB8_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v5
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v7
+; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v6
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
@@ -1385,10 +1385,10 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
@@ -1420,14 +1420,14 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v13
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:  BB9_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT:  BB9_5: ; %Flow3
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index b90148f5a5a1..e17c98ab7c45 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -239,11 +239,11 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
 ; VI-NEXT:    s_add_i32 s1, s1, 12
 ; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; VI-NEXT:    s_or_b32 s0, s1, 4
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
@@ -283,10 +283,10 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v0, 0x3e7, v0
-; VI-NEXT:    v_or_b32_e32 v2, 4, v0
+; VI-NEXT:    v_add_u16_e32 v2, 0x3e7, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_or_b32_e32 v2, 4, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()