[llvm] 201f770 - GlobalISel: Add and_trivial_mask to all_combines

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 27 13:42:18 PDT 2020


Author: Matt Arsenault
Date: 2020-08-27T16:42:09-04:00
New Revision: 201f770f16e8ea89e92a1edd79c41559f4031187

URL: https://github.com/llvm/llvm-project/commit/201f770f16e8ea89e92a1edd79c41559f4031187
DIFF: https://github.com/llvm/llvm-project/commit/201f770f16e8ea89e92a1edd79c41559f4031187.diff

LOG: GlobalISel: Add and_trivial_mask to all_combines

Also make up a new category of combines.

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir

Modified: 
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index eb2810844637..f0645d8380d8 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -357,6 +357,8 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
                                         binop_right_to_zero, p2i_to_i2p,
                                         i2p_to_p2i]>;
 
+def known_bits_simplifications : GICombineGroup<[and_trivial_mask]>;
+
 def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
 
 def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>;
@@ -367,4 +369,5 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     identity_combines, simplify_add_to_sub,
     hoist_logic_op_with_same_opcode_hands,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
-    width_reduction_combines, select_combines]>;
+    width_reduction_combines, select_combines,
+    known_bits_simplifications]>;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 2205bfe3c71d..721269072fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -159,11 +159,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s1, 0xffc0
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
@@ -186,10 +185,9 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s1, s1, 4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -212,10 +210,9 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ;
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s2, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_add_i32 s0, s0, 4
 ; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -239,13 +236,11 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ;
 ; GFX8-LABEL: s_add_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
@@ -271,13 +266,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8-LABEL: s_add_v2i16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
@@ -305,13 +298,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8-LABEL: s_add_v2i16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
@@ -343,13 +334,11 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-NEXT:    s_mov_b32 s2, 0x80008000
 ; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s2
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index c1896f81ef29..12f4d441f7cc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -47,8 +47,8 @@ define i8 @v_ashr_i8_7(i8 %value) {
 ; GFX9-LABEL: v_ashr_i8_7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 7
-; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, s4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 7
+; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i8 %value, 7
   ret i8 %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index a6c2524f1962..e3775b62a4d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -514,7 +514,6 @@ define i64 @v_bswap_i48(i64 %src) {
 ; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; GFX7-NEXT:    v_bfi_b32 v2, s4, v0, v2
 ; GFX7-NEXT:    v_lshr_b64 v[0:1], v[1:2], 16
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i48:
@@ -524,7 +523,6 @@ define i64 @v_bswap_i48(i64 %src) {
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
 ; GFX8-NEXT:    v_perm_b32 v2, 0, v0, s4
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i48:
@@ -534,7 +532,6 @@ define i64 @v_bswap_i48(i64 %src) {
 ; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
 ; GFX9-NEXT:    v_perm_b32 v2, 0, v0, s4
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i64 %src to i48
   %bswap = call i48 @llvm.bswap.i48(i48 %trunc)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 37ea037ab1a7..f0f0edcb57f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -224,30 +224,28 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SI-NEXT:    s_movk_i32 s4, 0xff
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_and_b32_e32 v3, s4, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_and_b32_e32 v3, s4, v3
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
+; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_movk_i32 s4, 0xff
-; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; VI-NEXT:    v_and_b32_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index 95b4177abbca..cc383ff8806d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -19,8 +19,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s1, s2, s5
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s5
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_and_b32 s1, s4, 3
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 3
@@ -40,11 +39,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v2
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -61,11 +61,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -86,7 +87,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %p
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -112,11 +112,12 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
-; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -131,11 +132,12 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -158,7 +160,6 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -189,8 +190,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX9-NEXT:    s_and_b32 s1, s2, s4
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, s4
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -213,8 +213,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX8-NEXT:    s_and_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -237,8 +236,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* in
 ; GFX7-NEXT:    s_and_b32 s1, s2, s4
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s3, s4
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
@@ -259,14 +257,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 16
 ; GCN-NEXT:    s_lshr_b32 s4, s1, 24
 ; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_and_b32 s0, s3, s0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s3, s0
-; GCN-NEXT:    s_and_b32 s0, s4, s0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s1, s4, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
   %element = extractelement <4 x i8> %vector, i32 0
@@ -284,14 +281,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 16
 ; GCN-NEXT:    s_lshr_b32 s4, s1, 24
 ; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_and_b32 s0, s3, s0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s3, s0
-; GCN-NEXT:    s_and_b32 s0, s4, s0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s1, s4, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 8
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -310,14 +306,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 16
 ; GCN-NEXT:    s_lshr_b32 s4, s1, 24
 ; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_and_b32 s0, s3, s0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s3, s0
-; GCN-NEXT:    s_and_b32 s0, s4, s0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s1, s4, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -336,14 +331,13 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_lshr_b32 s3, s1, 16
 ; GCN-NEXT:    s_lshr_b32 s4, s1, 24
 ; GCN-NEXT:    s_and_b32 s1, s1, s0
+; GCN-NEXT:    s_and_b32 s0, s3, s0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s3, s0
-; GCN-NEXT:    s_and_b32 s0, s4, s0
-; GCN-NEXT:    s_lshl_b32 s2, s2, 16
-; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_lshl_b32 s0, s0, 24
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s1, s4, 24
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 24
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr
@@ -360,11 +354,12 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
@@ -377,11 +372,12 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx0:
@@ -396,11 +392,10 @@ define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -421,11 +416,12 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -439,11 +435,12 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -459,11 +456,10 @@ define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -485,11 +481,12 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -503,11 +500,12 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -523,11 +521,10 @@ define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -549,11 +546,12 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -567,11 +565,12 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -587,11 +586,10 @@ define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -620,8 +618,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s3, s9
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s2
-; GCN-NEXT:    s_and_b32 s2, s5, s9
-; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshl_b32 s2, s5, 24
 ; GCN-NEXT:    s_lshr_b32 s6, s1, 8
 ; GCN-NEXT:    s_or_b32 s0, s0, s2
 ; GCN-NEXT:    s_and_b32 s2, s6, s9
@@ -633,8 +630,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s7, s9
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s8, s9
-; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshl_b32 s2, s8, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
 ; GCN-NEXT:    s_lshr_b32 s2, s4, 2
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 1
@@ -653,24 +649,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX9-NEXT:    s_movk_i32 s1, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 2
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v2
-; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v4
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v3
+; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
@@ -690,19 +687,21 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -730,11 +729,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
 ; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -761,24 +758,25 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v3
-; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v8
-; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v5
+; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
@@ -797,19 +795,21 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v10
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
@@ -841,8 +841,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -881,8 +879,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s3, s8
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s2
-; GCN-NEXT:    s_and_b32 s2, s4, s8
-; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshl_b32 s2, s4, 24
 ; GCN-NEXT:    s_lshr_b32 s5, s1, 8
 ; GCN-NEXT:    s_or_b32 s0, s0, s2
 ; GCN-NEXT:    s_and_b32 s2, s5, s8
@@ -894,8 +891,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
 ; GCN-NEXT:    s_and_b32 s2, s6, s8
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
-; GCN-NEXT:    s_and_b32 s2, s7, s8
-; GCN-NEXT:    s_lshl_b32 s2, s2, 24
+; GCN-NEXT:    s_lshl_b32 s2, s7, 24
 ; GCN-NEXT:    s_or_b32 s1, s1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
@@ -925,8 +921,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -950,8 +945,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 8
 ; GCN-NEXT:    ; return to shader part epilog
@@ -976,8 +970,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1002,8 +995,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 24
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1028,8 +1020,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    ; return to shader part epilog
   %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@@ -1053,8 +1044,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 8
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1079,8 +1069,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1105,8 +1094,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg
 ; GCN-NEXT:    s_and_b32 s1, s2, s4
 ; GCN-NEXT:    s_lshl_b32 s1, s1, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s1, s3, s4
-; GCN-NEXT:    s_lshl_b32 s1, s1, 24
+; GCN-NEXT:    s_lshl_b32 s1, s3, 24
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 24
 ; GCN-NEXT:    ; return to shader part epilog
@@ -1124,11 +1112,12 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
@@ -1141,11 +1130,12 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx0:
@@ -1160,11 +1150,10 @@ define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1185,11 +1174,12 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1203,11 +1193,12 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1223,11 +1214,10 @@ define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1249,11 +1239,12 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1267,11 +1258,12 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1287,11 +1279,10 @@ define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1313,11 +1304,12 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1331,11 +1323,12 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1351,11 +1344,10 @@ define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1378,10 +1370,11 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
@@ -1394,10 +1387,11 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1413,11 +1407,10 @@ define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1439,10 +1432,11 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1456,10 +1450,11 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1476,11 +1471,10 @@ define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1503,10 +1497,11 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1520,10 +1515,11 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1540,11 +1536,10 @@ define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1567,10 +1562,11 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1584,10 +1580,11 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1604,11 +1601,10 @@ define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -1637,8 +1633,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s5, s6, s17
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s5
-; GCN-NEXT:    s_and_b32 s5, s7, s17
-; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshl_b32 s5, s7, 24
 ; GCN-NEXT:    s_lshr_b32 s8, s1, 8
 ; GCN-NEXT:    s_or_b32 s0, s0, s5
 ; GCN-NEXT:    s_and_b32 s5, s8, s17
@@ -1650,8 +1645,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s5, s9, s17
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 16
 ; GCN-NEXT:    s_or_b32 s1, s1, s5
-; GCN-NEXT:    s_and_b32 s5, s10, s17
-; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshl_b32 s5, s10, 24
 ; GCN-NEXT:    s_lshr_b32 s11, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s5
 ; GCN-NEXT:    s_and_b32 s5, s11, s17
@@ -1663,8 +1657,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s5, s12, s17
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 16
 ; GCN-NEXT:    s_or_b32 s2, s2, s5
-; GCN-NEXT:    s_and_b32 s5, s13, s17
-; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshl_b32 s5, s13, 24
 ; GCN-NEXT:    s_lshr_b32 s14, s3, 8
 ; GCN-NEXT:    s_or_b32 s2, s2, s5
 ; GCN-NEXT:    s_and_b32 s5, s14, s17
@@ -1676,8 +1669,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s5, s15, s17
 ; GCN-NEXT:    s_lshl_b32 s5, s5, 16
 ; GCN-NEXT:    s_or_b32 s3, s3, s5
-; GCN-NEXT:    s_and_b32 s5, s16, s17
-; GCN-NEXT:    s_lshl_b32 s5, s5, 24
+; GCN-NEXT:    s_lshl_b32 s5, s16, 24
 ; GCN-NEXT:    s_or_b32 s3, s3, s5
 ; GCN-NEXT:    s_lshr_b32 s5, s4, 2
 ; GCN-NEXT:    s_cmp_eq_u32 s5, 1
@@ -1702,38 +1694,42 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    s_movk_i32 s1, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_and_b32_sdwa v14, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v6
-; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v7
-; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s1, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v11
-; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v8
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s1, v10
+; GFX9-NEXT:    v_or3_b32 v0, v0, v14, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v9
+; GFX9-NEXT:    v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, v4, v5
-; GFX9-NEXT:    v_or3_b32 v2, v2, v14, v15
+; GFX9-NEXT:    v_or3_b32 v2, v2, v16, v11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v17
+; GFX9-NEXT:    v_or3_b32 v3, v3, v17, v12
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    s_lshl_b32 s0, s2, 3
@@ -1754,37 +1750,41 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX8-NEXT:    s_and_b32 s1, s2, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
+; GFX8-NEXT:    v_and_b32_sdwa v16, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
-; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT:    v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v6, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v17, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v15
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v12
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX8-NEXT:    s_lshl_b32 s0, s1, 3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
@@ -1816,27 +1816,24 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
 ; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
 ; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
-; GFX7-NEXT:    v_and_b32_e32 v11, v11, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, s0, v11
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
-; GFX7-NEXT:    v_and_b32_e32 v10, s0, v10
 ; GFX7-NEXT:    v_and_b32_e32 v12, v12, v4
 ; GFX7-NEXT:    v_and_b32_e32 v14, v14, v4
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
-; GFX7-NEXT:    v_and_b32_e32 v13, v13, v4
-; GFX7-NEXT:    v_and_b32_e32 v15, v15, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, v15, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
@@ -1844,20 +1841,19 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v11
-; GFX7-NEXT:    v_and_b32_e32 v4, v16, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v14
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v10
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 24, v16
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v13
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 2
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX7-NEXT:    s_lshl_b32 s0, s2, 3
@@ -1877,38 +1873,42 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, s5, v8
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s5, v9
-; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s5, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
+; GFX9-NEXT:    v_and_b32_sdwa v18, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v6, v0, v1
-; GFX9-NEXT:    v_or3_b32 v1, v3, v12, v13
-; GFX9-NEXT:    v_or3_b32 v3, v4, v14, v15
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s5, v10
+; GFX9-NEXT:    v_or3_b32 v1, v3, v16, v9
+; GFX9-NEXT:    v_or3_b32 v3, v4, v17, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s5, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_or3_b32 v4, v5, v16, v17
+; GFX9-NEXT:    v_or3_b32 v4, v5, v18, v13
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_or3_b32 v0, v0, v18, v19
+; GFX9-NEXT:    v_or3_b32 v0, v0, v19, v14
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
@@ -1929,36 +1929,40 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v5
+; GFX8-NEXT:    v_and_b32_sdwa v18, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
-; GFX8-NEXT:    v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
-; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
-; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v18
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 24, v5
+; GFX8-NEXT:    v_and_b32_sdwa v19, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v19
+; GFX8-NEXT:    v_and_b32_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 24, v17
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v14
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
@@ -1991,27 +1995,24 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 8, v6
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
 ; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
-; GFX7-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 24, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
 ; GFX7-NEXT:    v_and_b32_e32 v13, v13, v0
 ; GFX7-NEXT:    v_and_b32_e32 v15, v15, v0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
 ; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; GFX7-NEXT:    v_and_b32_e32 v14, v14, v0
-; GFX7-NEXT:    v_and_b32_e32 v16, v16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 24, v6
 ; GFX7-NEXT:    v_and_b32_e32 v6, v6, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
@@ -2019,20 +2020,19 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX7-NEXT:    v_and_b32_e32 v0, v17, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v11
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 24, v17
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v14
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v18
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v16
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v18
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
@@ -2062,8 +2062,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s4, s5, s16
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 16
 ; GCN-NEXT:    s_or_b32 s0, s0, s4
-; GCN-NEXT:    s_and_b32 s4, s6, s16
-; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshl_b32 s4, s6, 24
 ; GCN-NEXT:    s_lshr_b32 s7, s1, 8
 ; GCN-NEXT:    s_or_b32 s0, s0, s4
 ; GCN-NEXT:    s_and_b32 s4, s7, s16
@@ -2075,8 +2074,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s4, s8, s16
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 16
 ; GCN-NEXT:    s_or_b32 s1, s1, s4
-; GCN-NEXT:    s_and_b32 s4, s9, s16
-; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshl_b32 s4, s9, 24
 ; GCN-NEXT:    s_lshr_b32 s10, s2, 8
 ; GCN-NEXT:    s_or_b32 s1, s1, s4
 ; GCN-NEXT:    s_and_b32 s4, s10, s16
@@ -2088,8 +2086,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s4, s11, s16
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 16
 ; GCN-NEXT:    s_or_b32 s2, s2, s4
-; GCN-NEXT:    s_and_b32 s4, s12, s16
-; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshl_b32 s4, s12, 24
 ; GCN-NEXT:    s_lshr_b32 s13, s3, 8
 ; GCN-NEXT:    s_or_b32 s2, s2, s4
 ; GCN-NEXT:    s_and_b32 s4, s13, s16
@@ -2101,8 +2098,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
 ; GCN-NEXT:    s_and_b32 s4, s14, s16
 ; GCN-NEXT:    s_lshl_b32 s4, s4, 16
 ; GCN-NEXT:    s_or_b32 s3, s3, s4
-; GCN-NEXT:    s_and_b32 s4, s15, s16
-; GCN-NEXT:    s_lshl_b32 s4, s4, 24
+; GCN-NEXT:    s_lshl_b32 s4, s15, 24
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    s_or_b32 s3, s3, s4
@@ -2131,11 +2127,12 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
@@ -2148,11 +2145,12 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx0:
@@ -2167,11 +2165,10 @@ define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2192,11 +2189,12 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2210,11 +2208,12 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2230,11 +2229,10 @@ define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2256,11 +2254,12 @@ define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2274,11 +2273,12 @@ define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2294,11 +2294,10 @@ define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2320,11 +2319,12 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2338,11 +2338,12 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2358,11 +2359,10 @@ define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2385,10 +2385,11 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
@@ -2401,10 +2402,11 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2420,11 +2422,10 @@ define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2446,10 +2447,11 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2463,10 +2465,11 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2483,11 +2486,10 @@ define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2510,10 +2512,11 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2527,10 +2530,11 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2547,11 +2551,10 @@ define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2574,10 +2577,11 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v1
+; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2591,10 +2595,11 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2611,11 +2616,10 @@ define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2637,11 +2641,12 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
@@ -2654,10 +2659,11 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2673,11 +2679,10 @@ define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2698,11 +2703,12 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2716,10 +2722,11 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2736,11 +2743,10 @@ define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2762,11 +2768,12 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2780,10 +2787,11 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2800,11 +2808,10 @@ define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2826,11 +2833,12 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v3, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v2, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2844,10 +2852,11 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2864,11 +2873,10 @@ define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2890,11 +2898,12 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
@@ -2907,10 +2916,11 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2926,11 +2936,10 @@ define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2951,11 +2960,12 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2969,10 +2979,11 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2989,11 +3000,10 @@ define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3015,11 +3025,12 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3033,10 +3044,11 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3053,11 +3065,10 @@ define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3079,11 +3090,12 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) {
 ; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v0, v3, s5, v0
-; GFX9-NEXT:    v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3097,10 +3109,11 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3117,11 +3130,10 @@ define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 7c3e74dfcf69..2951bb86a16f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -596,18 +596,20 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
-; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_or3_b32 v2, v0, v4, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -618,8 +620,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    s_and_b32 s1, s3, 3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX8-NEXT:    s_and_b32 s2, s2, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
@@ -629,20 +631,22 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -670,7 +674,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -685,7 +688,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -717,8 +719,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s6
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s5, s6
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_and_b32 s2, s4, 3
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 3
@@ -727,11 +728,12 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, s2, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v0, s6, v1
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT:    v_or3_b32 v2, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -741,7 +743,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    s_movk_i32 s5, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
 ; GFX8-NEXT:    s_and_b32 s1, s1, s5
@@ -753,8 +755,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s1, s2, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s5
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 3
@@ -764,13 +765,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -792,8 +794,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s1, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s3, s5
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 3
@@ -811,8 +812,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v1, s5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -844,8 +844,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s6
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s5, s6
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_and_b32 s2, s4, s6
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v0, s2
@@ -853,11 +852,12 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_and_or_b32 v0, s1, v0, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v0, s6, v1
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT:    v_or3_b32 v2, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -869,7 +869,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_movk_i32 s5, 0xff
 ; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 8
 ; GFX8-NEXT:    s_and_b32 s1, s1, s5
@@ -881,8 +881,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s1, s2, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s5
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s4, s5
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
@@ -891,13 +890,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -920,8 +920,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s1, s2, s5
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s3, s5
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s4, s5
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
@@ -939,8 +938,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v1, s5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -971,20 +969,20 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_and_b32 s2, s3, s5
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s4, s5
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s5
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_or_b32 s1, s1, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s4, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_or_b32 v0, s1, v1, v0
 ; GFX9-NEXT:    s_mov_b32 s0, 8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v0, s5, v1
-; GFX9-NEXT:    v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v1, v2, v0
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT:    v_or3_b32 v2, v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1007,23 +1005,23 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, s4
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1047,8 +1045,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s1, s2, s4
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s3, s4
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, s4, v1
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -1065,8 +1062,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -1094,18 +1090,20 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v4
-; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v5
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_or3_b32 v2, v0, v4, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1116,8 +1114,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    s_and_b32 s1, s2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v2, s1
@@ -1127,20 +1125,22 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1168,7 +1168,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
@@ -1183,7 +1182,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1211,18 +1209,20 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v3
-; GFX9-NEXT:    v_or3_b32 v0, v0, v4, v5
+; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s2, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v1
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v4
+; GFX9-NEXT:    v_or3_b32 v2, v0, v4, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1244,20 +1244,22 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1285,7 +1287,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -1300,7 +1301,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -1329,18 +1329,20 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v5
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v6
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v1, v2
-; GFX9-NEXT:    v_or3_b32 v2, v0, v3, v5
+; GFX9-NEXT:    v_or3_b32 v2, v0, v5, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -1351,8 +1353,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_movk_i32 s0, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s0
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
@@ -1362,20 +1364,22 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_and_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1403,7 +1407,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v5, s2, v5
 ; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    v_and_b32_e32 v6, s2, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
@@ -1412,18 +1415,17 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v2, v2, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, v3, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
@@ -1451,8 +1453,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s6, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s6, 24
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s7, s10
@@ -1464,8 +1465,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s8, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s9, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s9, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s5, 2
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
@@ -1491,8 +1491,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s4, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s4, 24
 ; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s5, s10
@@ -1504,8 +1503,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s6, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s7, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s7, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1529,8 +1527,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s3, s10
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s6, s10
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s6, 24
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s7, s10
@@ -1542,8 +1539,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s8, s10
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s2, s9, s10
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s9, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s5, 2
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
@@ -1569,8 +1565,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s3, s10
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s4, s10
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s4, 24
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s5, s10
@@ -1582,8 +1577,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s6, s10
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s2, s7, s10
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s7, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1605,8 +1599,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s3, s10
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s6, s10
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s6, 24
 ; GFX7-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s7, s10
@@ -1618,8 +1611,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s8, s10
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s2, s9, s10
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s9, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_lshr_b32 s2, s5, 2
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
@@ -1645,8 +1637,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s4, s5, s10
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s4
-; GFX7-NEXT:    s_and_b32 s4, s6, s10
-; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX7-NEXT:    s_lshr_b32 s7, s3, 8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s4
 ; GFX7-NEXT:    s_and_b32 s4, s7, s10
@@ -1658,8 +1649,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s4, s8, s10
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
-; GFX7-NEXT:    s_and_b32 s4, s9, s10
-; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshl_b32 s4, s9, 24
 ; GFX7-NEXT:    s_or_b32 s3, s3, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
@@ -1679,7 +1669,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s1, s3, 2
 ; GFX9-NEXT:    s_and_b32 s3, s3, 3
@@ -1690,37 +1679,42 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    s_not_b32 s3, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v4
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v5
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v5
+; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_and_or_b32 v3, v4, s3, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v3
-; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v4
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -1742,19 +1736,21 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -1762,20 +1758,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
@@ -1810,8 +1808,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -1839,11 +1835,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
 ; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
 ; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -1881,8 +1875,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s6, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s6, 24
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s7, s10
@@ -1894,8 +1887,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s8, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s9, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s9, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_lshr_b32 s2, s4, 2
 ; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
@@ -1913,17 +1905,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s10, v2
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v1, s10, v2
-; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
+; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1933,7 +1927,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    s_movk_i32 s9, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX8-NEXT:    s_and_b32 s2, s2, s9
@@ -1945,8 +1940,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s3, s9
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s5, s9
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s6, s9
@@ -1958,8 +1952,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s7, s9
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s2, s8, s9
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s8, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, 2
 ; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
@@ -1978,21 +1971,21 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s9
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2014,8 +2007,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s3, s9
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s5, s9
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s6, s9
@@ -2027,8 +2019,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s7, s9
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s2, s8, s9
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s8, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_lshr_b32 s2, s4, 2
 ; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
@@ -2055,8 +2046,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s9, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v5
@@ -2068,8 +2058,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s9, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -2101,8 +2090,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s6, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s6, 24
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s7, s10
@@ -2114,8 +2102,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s8, s10
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s9, s10
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s9, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2133,17 +2120,19 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_mov_b32 s5, 8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s10, v2
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v1, s10, v2
-; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
+; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -2167,8 +2156,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s3, s9
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s5, s9
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX8-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s6, s9
@@ -2180,8 +2168,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s7, s9
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s2, s8, s9
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s8, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -2198,23 +2185,24 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s9
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2238,8 +2226,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s3, s9
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s5, s9
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX7-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s6, s9
@@ -2251,8 +2238,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s7, s9
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s2, s8, s9
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s8, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
@@ -2279,8 +2265,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s9, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v5
@@ -2292,8 +2277,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s9, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s9, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -2325,8 +2309,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s3, s9
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s2, s5, s9
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s5, 24
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s2
 ; GFX9-NEXT:    s_and_b32 s2, s6, s9
@@ -2338,8 +2321,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_and_b32 s2, s7, s9
 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s2, s8, s9
-; GFX9-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s2, s8, 24
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s9
@@ -2356,17 +2338,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX9-NEXT:    s_mov_b32 s4, 8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v0, s9, v2
-; GFX9-NEXT:    v_and_b32_sdwa v4, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v2, v4, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_or3_b32 v0, v2, v0, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX9-NEXT:    v_and_or_b32 v2, v1, s9, v2
-; GFX9-NEXT:    v_and_b32_sdwa v3, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
+; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -2390,8 +2374,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s3, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s2, s4, s8
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s4, 24
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s5, s8
@@ -2403,8 +2386,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    s_and_b32 s2, s6, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s2, s7, s8
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshl_b32 s2, s7, 24
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
@@ -2420,23 +2402,24 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v6, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2460,8 +2443,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s3, s8
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s2, s4, s8
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s4, 24
 ; GFX7-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s2
 ; GFX7-NEXT:    s_and_b32 s2, s5, s8
@@ -2473,8 +2455,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    s_and_b32 s2, s6, s8
 ; GFX7-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s2, s7, s8
-; GFX7-NEXT:    s_lshl_b32 s2, s2, 24
+; GFX7-NEXT:    s_lshl_b32 s2, s7, 24
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
@@ -2501,8 +2482,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s8, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s8, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s8, v5
@@ -2514,8 +2494,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
 ; GFX7-NEXT:    v_and_b32_e32 v2, s8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s8, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -2533,7 +2512,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -2543,37 +2521,42 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v6
-; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v7
-; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v9
-; GFX9-NEXT:    v_or3_b32 v1, v1, v10, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v11, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
 ; GFX9-NEXT:    v_and_or_b32 v2, v6, v2, v5
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
-; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v4
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -2596,39 +2579,43 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v12, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
@@ -2665,8 +2652,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
-; GFX7-NEXT:    v_and_b32_e32 v10, s3, v10
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -2696,8 +2681,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, s3, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -2721,7 +2704,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s1, s2, 2
 ; GFX9-NEXT:    s_and_b32 s2, s2, 3
@@ -2730,37 +2712,42 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
 ; GFX9-NEXT:    s_not_b32 s2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v4
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v5
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v6
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v5
+; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_and_or_b32 v2, v4, s2, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v7, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s3, v2
-; GFX9-NEXT:    v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s3, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GFX9-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX9-NEXT:    v_or3_b32 v0, v0, v7, v4
+; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -2782,19 +2769,21 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    s_not_b32 s0, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
@@ -2802,20 +2791,22 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
@@ -2851,8 +2842,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_and_b32_e32 v5, s3, v5
-; GFX7-NEXT:    v_and_b32_e32 v8, s3, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
@@ -2883,8 +2872,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, s3, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, s3, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -2908,7 +2895,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    s_movk_i32 s1, 0xff
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
@@ -2918,37 +2904,42 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v11, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s1, v7
-; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v8
-; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v10
-; GFX9-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_or3_b32 v0, v0, v11, v8
+; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
 ; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v2
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or3_b32 v0, v0, v8, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v6, v7
-; GFX9-NEXT:    v_or3_b32 v1, v1, v8, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v5
+; GFX9-NEXT:    v_or3_b32 v1, v1, v9, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -2971,39 +2962,43 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_and_b32_sdwa v13, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
@@ -3037,11 +3032,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_and_b32_e32 v7, s0, v7
 ; GFX7-NEXT:    v_and_b32_e32 v10, s0, v10
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_and_b32_e32 v8, s0, v8
-; GFX7-NEXT:    v_and_b32_e32 v11, s0, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
@@ -3061,29 +3054,27 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v6, v6, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX7-NEXT:    v_and_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, v7, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_and_b32_e32 v4, v7, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_and_b32_e32 v5, v5, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
   %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
@@ -3110,8 +3101,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s7, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
-; GFX9-NEXT:    s_and_b32 s6, s8, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s8, 24
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_and_b32 s6, s9, s18
@@ -3123,8 +3113,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s10, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
-; GFX9-NEXT:    s_and_b32 s6, s11, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s11, 24
 ; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
 ; GFX9-NEXT:    s_and_b32 s6, s12, s18
@@ -3136,8 +3125,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s13, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
-; GFX9-NEXT:    s_and_b32 s6, s14, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s14, 24
 ; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_and_b32 s6, s15, s18
@@ -3149,8 +3137,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s16, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s6
-; GFX9-NEXT:    s_and_b32 s6, s17, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s17, 24
 ; GFX9-NEXT:    s_or_b32 s3, s3, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s5, 2
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
@@ -3184,8 +3171,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s4, s5, s18
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s4
-; GFX9-NEXT:    s_and_b32 s4, s6, s18
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s4
 ; GFX9-NEXT:    s_and_b32 s4, s7, s18
@@ -3197,8 +3183,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s4, s8, s18
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s4
-; GFX9-NEXT:    s_and_b32 s4, s9, s18
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s4, s9, 24
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX9-NEXT:    s_or_b32 s1, s1, s4
 ; GFX9-NEXT:    s_and_b32 s4, s10, s18
@@ -3210,8 +3195,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s4, s11, s18
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_or_b32 s2, s2, s4
-; GFX9-NEXT:    s_and_b32 s4, s12, s18
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s4, s12, 24
 ; GFX9-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX9-NEXT:    s_or_b32 s2, s2, s4
 ; GFX9-NEXT:    s_and_b32 s4, s13, s18
@@ -3223,8 +3207,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s4, s14, s18
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
-; GFX9-NEXT:    s_and_b32 s4, s15, s18
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s4, s15, 24
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -3250,8 +3233,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s6, s7, s18
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
-; GFX8-NEXT:    s_and_b32 s6, s8, s18
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshl_b32 s6, s8, 24
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s6, s9, s18
@@ -3263,8 +3245,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s6, s10, s18
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
-; GFX8-NEXT:    s_and_b32 s6, s11, s18
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshl_b32 s6, s11, 24
 ; GFX8-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_and_b32 s6, s12, s18
@@ -3276,8 +3257,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s6, s13, s18
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
-; GFX8-NEXT:    s_and_b32 s6, s14, s18
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshl_b32 s6, s14, 24
 ; GFX8-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_and_b32 s6, s15, s18
@@ -3289,8 +3269,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s6, s16, s18
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
-; GFX8-NEXT:    s_and_b32 s6, s17, s18
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX8-NEXT:    s_lshl_b32 s6, s17, 24
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s5, 2
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
@@ -3324,8 +3303,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s5, s18
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s4, s6, s18
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_and_b32 s4, s7, s18
@@ -3337,8 +3315,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s8, s18
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NEXT:    s_and_b32 s4, s9, s18
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s9, 24
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_and_b32 s4, s10, s18
@@ -3350,8 +3327,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s11, s18
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
-; GFX8-NEXT:    s_and_b32 s4, s12, s18
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s12, 24
 ; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s13, s18
@@ -3363,8 +3339,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s14, s18
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
-; GFX8-NEXT:    s_and_b32 s4, s15, s18
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s15, 24
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -3388,8 +3363,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s6, s7, s18
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s6
-; GFX7-NEXT:    s_and_b32 s6, s8, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s8, 24
 ; GFX7-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s6
 ; GFX7-NEXT:    s_and_b32 s6, s9, s18
@@ -3401,8 +3375,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s6, s10, s18
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s6
-; GFX7-NEXT:    s_and_b32 s6, s11, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s11, 24
 ; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX7-NEXT:    s_or_b32 s1, s1, s6
 ; GFX7-NEXT:    s_and_b32 s6, s12, s18
@@ -3414,8 +3387,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s6, s13, s18
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
-; GFX7-NEXT:    s_and_b32 s6, s14, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s14, 24
 ; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
 ; GFX7-NEXT:    s_and_b32 s6, s15, s18
@@ -3427,8 +3399,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s6, s16, s18
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_or_b32 s3, s3, s6
-; GFX7-NEXT:    s_and_b32 s6, s17, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s17, 24
 ; GFX7-NEXT:    s_or_b32 s3, s3, s6
 ; GFX7-NEXT:    s_lshr_b32 s6, s5, 2
 ; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
@@ -3460,12 +3431,11 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX7-NEXT:    s_or_b32 s4, s5, s4
 ; GFX7-NEXT:    s_and_b32 s5, s6, s18
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_lshr_b32 s9, s7, 8
-; GFX7-NEXT:    s_or_b32 s4, s4, s5
-; GFX7-NEXT:    s_and_b32 s5, s8, s18
+; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_and_b32 s6, s9, s18
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_or_b32 s4, s4, s5
+; GFX7-NEXT:    s_lshl_b32 s5, s8, 24
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_lshr_b32 s10, s7, 16
 ; GFX7-NEXT:    s_and_b32 s5, s7, s18
@@ -3475,8 +3445,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_lshr_b32 s11, s7, 24
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
-; GFX7-NEXT:    s_and_b32 s6, s11, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s11, 24
 ; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX7-NEXT:    s_or_b32 s5, s5, s6
 ; GFX7-NEXT:    s_and_b32 s6, s12, s18
@@ -3488,8 +3457,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s6, s13, s18
 ; GFX7-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s6
-; GFX7-NEXT:    s_and_b32 s6, s14, s18
-; GFX7-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX7-NEXT:    s_lshl_b32 s6, s14, 24
 ; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX7-NEXT:    s_or_b32 s6, s2, s6
 ; GFX7-NEXT:    s_lshr_b32 s16, s3, 16
@@ -3501,8 +3469,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s3, s16, s18
 ; GFX7-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s3
-; GFX7-NEXT:    s_and_b32 s3, s17, s18
-; GFX7-NEXT:    s_lshl_b32 s3, s3, 24
+; GFX7-NEXT:    s_lshl_b32 s3, s17, 24
 ; GFX7-NEXT:    s_or_b32 s7, s2, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
@@ -3538,31 +3505,35 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v11, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v6
-; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v7
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v10, v11
-; GFX9-NEXT:    v_or3_b32 v1, v1, v12, v13
-; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v9
-; GFX9-NEXT:    v_or3_b32 v2, v2, v14, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
+; GFX9-NEXT:    v_and_b32_sdwa v14, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v10
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v0, v0, v14, v7
+; GFX9-NEXT:    v_or3_b32 v1, v1, v15, v9
+; GFX9-NEXT:    v_and_b32_sdwa v17, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v12
+; GFX9-NEXT:    v_or3_b32 v2, v2, v16, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v17
+; GFX9-NEXT:    v_or3_b32 v3, v3, v17, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v5, v6, s5, v5
@@ -3572,30 +3543,34 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v9, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v10, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v13, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX9-NEXT:    v_and_or_b32 v0, v0, s6, v5
-; GFX9-NEXT:    v_and_b32_sdwa v11, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v12, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v6
-; GFX9-NEXT:    v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v7
-; GFX9-NEXT:    v_and_b32_sdwa v15, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_or3_b32 v0, v0, v9, v10
-; GFX9-NEXT:    v_or3_b32 v1, v1, v11, v12
-; GFX9-NEXT:    v_or3_b32 v2, v2, v13, v14
-; GFX9-NEXT:    v_or3_b32 v3, v3, v15, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v9
+; GFX9-NEXT:    v_or3_b32 v0, v0, v13, v6
+; GFX9-NEXT:    v_or3_b32 v1, v1, v14, v8
+; GFX9-NEXT:    v_or3_b32 v2, v2, v15, v10
+; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v11
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
@@ -3619,34 +3594,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v12, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX8-NEXT:    v_and_b32_sdwa v15, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX8-NEXT:    v_and_b32_sdwa v17, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v18, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_sdwa v16, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX8-NEXT:    v_and_b32_sdwa v17, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v18, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v14
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v18
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v13
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
@@ -3657,34 +3636,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v3
+; GFX8-NEXT:    v_and_b32_sdwa v14, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v12, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v14
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v12
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -3728,8 +3711,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v6, s6, v6
-; GFX7-NEXT:    v_and_b32_e32 v9, s6, v9
 ; GFX7-NEXT:    v_and_b32_e32 v11, s6, v11
 ; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
 ; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
@@ -3739,7 +3720,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
 ; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
@@ -3749,7 +3729,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
@@ -3774,16 +3753,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
@@ -3793,8 +3766,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v10
@@ -3806,8 +3778,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
@@ -3815,12 +3786,16 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -3848,8 +3823,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s7, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
-; GFX9-NEXT:    s_and_b32 s6, s8, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s8, 24
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_and_b32 s6, s9, s18
@@ -3861,8 +3835,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s10, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
-; GFX9-NEXT:    s_and_b32 s6, s11, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s11, 24
 ; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
 ; GFX9-NEXT:    s_and_b32 s6, s12, s18
@@ -3874,8 +3847,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s13, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
-; GFX9-NEXT:    s_and_b32 s6, s14, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s14, 24
 ; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_and_b32 s6, s15, s18
@@ -3887,8 +3859,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s6, s16, s18
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s6
-; GFX9-NEXT:    s_and_b32 s6, s17, s18
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s17, 24
 ; GFX9-NEXT:    s_or_b32 s3, s3, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s4, 2
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
@@ -3916,30 +3887,34 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v4, v0, s18, v4
-; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_or3_b32 v1, v5, v1, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s18, v5
-; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, s18, v4
-; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v11
+; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -3949,7 +3924,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
 ; GFX8-NEXT:    s_movk_i32 s17, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v8, 8
+; GFX8-NEXT:    v_mov_b32_e32 v12, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s5, s0, 8
 ; GFX8-NEXT:    s_and_b32 s5, s5, s17
@@ -3961,8 +3936,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s5, s6, s17
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_and_b32 s5, s7, s17
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
 ; GFX8-NEXT:    s_and_b32 s5, s8, s17
@@ -3974,8 +3948,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s5, s9, s17
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s5
-; GFX8-NEXT:    s_and_b32 s5, s10, s17
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s10, 24
 ; GFX8-NEXT:    s_lshr_b32 s11, s2, 8
 ; GFX8-NEXT:    s_or_b32 s1, s1, s5
 ; GFX8-NEXT:    s_and_b32 s5, s11, s17
@@ -3987,8 +3960,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s5, s12, s17
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
-; GFX8-NEXT:    s_and_b32 s5, s13, s17
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s13, 24
 ; GFX8-NEXT:    s_lshr_b32 s14, s3, 8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s5
 ; GFX8-NEXT:    s_and_b32 s5, s14, s17
@@ -4000,8 +3972,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s5, s15, s17
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
-; GFX8-NEXT:    s_and_b32 s5, s16, s17
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s16, 24
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 2
 ; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
@@ -4030,35 +4001,39 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s17
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v12, s17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -4080,8 +4055,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s5, s6, s17
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
-; GFX7-NEXT:    s_and_b32 s5, s7, s17
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 24
 ; GFX7-NEXT:    s_lshr_b32 s8, s1, 8
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
 ; GFX7-NEXT:    s_and_b32 s5, s8, s17
@@ -4093,8 +4067,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s5, s9, s17
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s1, s1, s5
-; GFX7-NEXT:    s_and_b32 s5, s10, s17
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s10, 24
 ; GFX7-NEXT:    s_lshr_b32 s11, s2, 8
 ; GFX7-NEXT:    s_or_b32 s1, s1, s5
 ; GFX7-NEXT:    s_and_b32 s5, s11, s17
@@ -4106,8 +4079,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s5, s12, s17
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s2, s2, s5
-; GFX7-NEXT:    s_and_b32 s5, s13, s17
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s13, 24
 ; GFX7-NEXT:    s_lshr_b32 s14, s3, 8
 ; GFX7-NEXT:    s_or_b32 s2, s2, s5
 ; GFX7-NEXT:    s_and_b32 s5, s14, s17
@@ -4119,8 +4091,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s5, s15, s17
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s3, s3, s5
-; GFX7-NEXT:    s_and_b32 s5, s16, s17
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s16, 24
 ; GFX7-NEXT:    s_or_b32 s3, s3, s5
 ; GFX7-NEXT:    s_lshr_b32 s5, s4, 2
 ; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
@@ -4157,8 +4128,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s17, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v7
@@ -4170,8 +4140,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s17, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v10
@@ -4183,8 +4152,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s17, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v13
@@ -4196,8 +4164,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s17, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s17, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -4229,8 +4196,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s5, s7, s18
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s5
-; GFX9-NEXT:    s_and_b32 s5, s8, s18
-; GFX9-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX9-NEXT:    s_lshl_b32 s5, s8, 24
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_or_b32 s8, s0, s5
 ; GFX9-NEXT:    s_lshr_b32 s10, s1, 16
@@ -4242,8 +4208,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s1, s10, s18
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s11, s18
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX9-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX9-NEXT:    s_or_b32 s9, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s12, s18
@@ -4255,8 +4220,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s14, s2, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s14, s18
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s14, 24
 ; GFX9-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX9-NEXT:    s_or_b32 s10, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s15, s18
@@ -4268,8 +4232,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshr_b32 s17, s3, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s17, s18
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s17, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    s_or_b32 s11, s0, s1
@@ -4289,38 +4252,42 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GFX9-NEXT:    s_mov_b32 s6, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s18, v4
-; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s18, v4
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s18, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_or3_b32 v1, v5, v1, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s18, v5
-; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, s18, v4
-; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v11
+; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -4344,8 +4311,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s5, s6, s18
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_and_b32 s5, s7, s18
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s7, 24
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX8-NEXT:    s_or_b32 s8, s0, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s1, 16
@@ -4357,8 +4323,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s1, s10, s18
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s11, s18
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX8-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX8-NEXT:    s_or_b32 s9, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s12, s18
@@ -4370,8 +4335,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s14, s2, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s14, s18
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s14, 24
 ; GFX8-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX8-NEXT:    s_or_b32 s10, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s15, s18
@@ -4383,8 +4347,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshr_b32 s17, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s17, s18
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s17, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    s_or_b32 s11, s0, s1
@@ -4406,42 +4369,46 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s18
+; GFX8-NEXT:    v_mov_b32_e32 v12, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mov_b32_e32 v12, s18
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -4465,8 +4432,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s5, s6, s18
 ; GFX7-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s5
-; GFX7-NEXT:    s_and_b32 s5, s7, s18
-; GFX7-NEXT:    s_lshl_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshl_b32 s5, s7, 24
 ; GFX7-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX7-NEXT:    s_or_b32 s8, s0, s5
 ; GFX7-NEXT:    s_lshr_b32 s10, s1, 16
@@ -4478,8 +4444,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s1, s10, s18
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s11, s18
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s11, 24
 ; GFX7-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX7-NEXT:    s_or_b32 s9, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s12, s18
@@ -4491,8 +4456,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s14, s2, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s14, s18
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s14, 24
 ; GFX7-NEXT:    s_lshr_b32 s15, s3, 8
 ; GFX7-NEXT:    s_or_b32 s10, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s15, s18
@@ -4504,8 +4468,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshr_b32 s17, s3, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s17, s18
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s17, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX7-NEXT:    s_or_b32 s11, s0, s1
@@ -4542,8 +4505,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s18, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v7
@@ -4555,8 +4517,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s18, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v10
@@ -4568,8 +4529,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s18, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v13
@@ -4581,8 +4541,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s18, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s18, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -4614,8 +4573,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s4, s5, s17
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s4
-; GFX9-NEXT:    s_and_b32 s4, s6, s17
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX9-NEXT:    s_or_b32 s4, s0, s4
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 16
@@ -4627,8 +4585,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_and_b32 s1, s9, s17
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s10, s17
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s10, 24
 ; GFX9-NEXT:    s_lshr_b32 s11, s2, 8
 ; GFX9-NEXT:    s_or_b32 s5, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s11, s17
@@ -4640,8 +4597,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s13, s2, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s13, s17
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s13, 24
 ; GFX9-NEXT:    s_lshr_b32 s14, s3, 8
 ; GFX9-NEXT:    s_or_b32 s6, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s14, s17
@@ -4653,8 +4609,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    s_lshr_b32 s16, s3, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s16, s17
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s16, 24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_or_b32 s7, s0, s1
@@ -4678,33 +4633,37 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s8, 8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_and_or_b32 v4, v0, s17, v4
-; GFX9-NEXT:    v_and_b32_sdwa v8, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v0, v4, v8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v4, v0, s17, v4
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT:    v_or3_b32 v0, v4, v0, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX9-NEXT:    v_and_or_b32 v5, v1, s17, v5
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v5, v8, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_and_or_b32 v5, v1, s17, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v7
+; GFX9-NEXT:    v_or3_b32 v1, v5, v1, v6
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s17, v5
-; GFX9-NEXT:    v_and_b32_sdwa v6, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v2, v5, v6, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v9
+; GFX9-NEXT:    v_or3_b32 v2, v5, v2, v6
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, s17, v4
-; GFX9-NEXT:    v_and_b32_sdwa v5, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v3, v4, v5, v3
+; GFX9-NEXT:    v_and_b32_sdwa v3, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v11
+; GFX9-NEXT:    v_or3_b32 v3, v4, v3, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -4728,8 +4687,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s4, s5, s16
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s4, s6, s16
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX8-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX8-NEXT:    s_or_b32 s4, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 16
@@ -4741,8 +4699,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_and_b32 s1, s8, s16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s9, s16
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s9, 24
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX8-NEXT:    s_or_b32 s5, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s10, s16
@@ -4754,8 +4711,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s12, s16
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s12, 24
 ; GFX8-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX8-NEXT:    s_or_b32 s6, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s13, s16
@@ -4767,8 +4723,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    s_lshr_b32 s15, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s15, s16
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s15, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_or_b32 s7, s0, s1
@@ -4792,39 +4747,43 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v8, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s16
-; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v12, 8
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v12, s16
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 8
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_and_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v3, v3, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -4848,8 +4807,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s4, s5, s16
 ; GFX7-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s4
-; GFX7-NEXT:    s_and_b32 s4, s6, s16
-; GFX7-NEXT:    s_lshl_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshl_b32 s4, s6, 24
 ; GFX7-NEXT:    s_lshr_b32 s7, s1, 8
 ; GFX7-NEXT:    s_or_b32 s4, s0, s4
 ; GFX7-NEXT:    s_lshr_b32 s8, s1, 16
@@ -4861,8 +4819,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_and_b32 s1, s8, s16
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s9, s16
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s9, 24
 ; GFX7-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX7-NEXT:    s_or_b32 s5, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s10, s16
@@ -4874,8 +4831,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s12, s16
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s12, 24
 ; GFX7-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX7-NEXT:    s_or_b32 s6, s0, s1
 ; GFX7-NEXT:    s_and_b32 s1, s13, s16
@@ -4887,8 +4843,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    s_lshr_b32 s15, s3, 24
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s15, s16
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX7-NEXT:    s_lshl_b32 s1, s15, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX7-NEXT:    s_or_b32 s7, s0, s1
@@ -4925,8 +4880,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s16, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v7
@@ -4938,8 +4892,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s16, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v10
@@ -4951,8 +4904,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s16, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v13
@@ -4964,8 +4916,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
 ; GFX7-NEXT:    v_and_b32_e32 v4, s16, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s16, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -4997,31 +4948,35 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
-; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v8
-; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v9
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v10
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or3_b32 v3, v3, v12, v13
-; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v15
-; GFX9-NEXT:    v_and_b32_sdwa v18, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v19, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v6, v6, s6, v11
-; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
+; GFX9-NEXT:    v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v17, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v10
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
+; GFX9-NEXT:    v_and_b32_sdwa v18, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s6, v12
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v3, v3, v16, v9
+; GFX9-NEXT:    v_or3_b32 v4, v4, v17, v11
+; GFX9-NEXT:    v_and_b32_sdwa v19, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; GFX9-NEXT:    v_and_or_b32 v6, v6, s6, v14
+; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v4, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
-; GFX9-NEXT:    v_or3_b32 v6, v6, v18, v19
+; GFX9-NEXT:    v_or3_b32 v6, v6, v19, v15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v2, v8, v2, v7
@@ -5030,29 +4985,33 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v9, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v10, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v13, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v1, v1, s6, v5
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX9-NEXT:    v_and_or_b32 v5, v2, s6, v0
-; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v16, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v7
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
-; GFX9-NEXT:    v_or3_b32 v0, v1, v9, v10
-; GFX9-NEXT:    v_or3_b32 v1, v3, v11, v12
-; GFX9-NEXT:    v_or3_b32 v2, v4, v13, v14
-; GFX9-NEXT:    v_or3_b32 v3, v5, v15, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v7
+; GFX9-NEXT:    v_or3_b32 v0, v1, v13, v6
+; GFX9-NEXT:    v_or3_b32 v1, v3, v14, v8
+; GFX9-NEXT:    v_or3_b32 v2, v4, v15, v10
+; GFX9-NEXT:    v_or3_b32 v3, v5, v16, v11
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -5077,35 +5036,39 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v14, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v5
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
-; GFX8-NEXT:    v_and_b32_sdwa v17, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v16
-; GFX8-NEXT:    v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v19, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v18
-; GFX8-NEXT:    v_and_b32_sdwa v6, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v10
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v15
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 8, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
+; GFX8-NEXT:    v_and_b32_sdwa v18, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v19, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 24, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v19
+; GFX8-NEXT:    v_and_b32_sdwa v10, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v16, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 24, v6
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v15
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v10
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v12
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v17
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v2, v6, v2
@@ -5115,34 +5078,38 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v12
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v14
-; GFX8-NEXT:    v_or_b32_e32 v4, v1, v16
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v13
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v4, v1, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v12
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -5187,8 +5154,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v10
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
 ; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
 ; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -5198,7 +5163,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
 ; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
@@ -5208,7 +5172,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_and_b32_e32 v16, s6, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -5232,16 +5195,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s6, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, s6, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
@@ -5251,8 +5208,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v2, s6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s6, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
@@ -5264,8 +5220,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s6, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
@@ -5273,12 +5228,16 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -5305,31 +5264,35 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
-; GFX9-NEXT:    v_and_b32_sdwa v10, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v2, v3, s6, v2
-; GFX9-NEXT:    v_and_b32_sdwa v12, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v3, v4, s6, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v14, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v8
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or3_b32 v2, v2, v10, v11
-; GFX9-NEXT:    v_or3_b32 v3, v3, v12, v13
-; GFX9-NEXT:    v_and_b32_sdwa v16, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v5, v6, s6, v9
-; GFX9-NEXT:    v_or3_b32 v4, v4, v14, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v6
+; GFX9-NEXT:    v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v2, v3, s6, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_and_or_b32 v3, v4, s6, v8
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v6
+; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v10
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or3_b32 v2, v2, v14, v7
+; GFX9-NEXT:    v_or3_b32 v3, v3, v15, v9
+; GFX9-NEXT:    v_and_b32_sdwa v17, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s6, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; GFX9-NEXT:    v_or3_b32 v4, v4, v16, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v3, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
-; GFX9-NEXT:    v_or3_b32 v5, v5, v16, v17
+; GFX9-NEXT:    v_or3_b32 v5, v5, v17, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
 ; GFX9-NEXT:    v_and_or_b32 v1, v6, s5, v1
@@ -5339,29 +5302,33 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v9, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v10, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v14, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v2, v2, s6, v5
+; GFX9-NEXT:    v_and_b32_sdwa v16, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; GFX9-NEXT:    v_and_or_b32 v5, v1, s6, v0
-; GFX9-NEXT:    v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v16, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v6
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v7
-; GFX9-NEXT:    v_or3_b32 v0, v2, v9, v10
-; GFX9-NEXT:    v_or3_b32 v1, v3, v11, v12
-; GFX9-NEXT:    v_or3_b32 v2, v4, v13, v14
-; GFX9-NEXT:    v_or3_b32 v3, v5, v15, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX9-NEXT:    v_and_or_b32 v3, v3, s6, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s6, v9
+; GFX9-NEXT:    v_or3_b32 v0, v2, v13, v6
+; GFX9-NEXT:    v_or3_b32 v1, v3, v14, v8
+; GFX9-NEXT:    v_or3_b32 v2, v4, v15, v10
+; GFX9-NEXT:    v_or3_b32 v3, v5, v16, v11
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -5386,34 +5353,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v9, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 24, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v11, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
+; GFX8-NEXT:    v_and_b32_sdwa v16, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX8-NEXT:    v_and_b32_sdwa v16, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v16
-; GFX8-NEXT:    v_and_b32_sdwa v18, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v13
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
-; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v18
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX8-NEXT:    v_and_b32_sdwa v17, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
+; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v15
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v14
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v5, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v6, s5, v6
@@ -5424,34 +5395,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v14, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v12
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v14
-; GFX8-NEXT:    v_or_b32_e32 v4, v1, v16
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v13
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT:    v_or_b32_e32 v4, v1, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v12
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -5495,8 +5470,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v5
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, s6, v7
-; GFX7-NEXT:    v_and_b32_e32 v10, s6, v10
 ; GFX7-NEXT:    v_and_b32_e32 v12, s6, v12
 ; GFX7-NEXT:    v_and_b32_e32 v14, s6, v14
 ; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -5506,7 +5479,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
-; GFX7-NEXT:    v_and_b32_e32 v13, s6, v13
 ; GFX7-NEXT:    v_and_b32_e32 v15, s6, v15
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
@@ -5516,7 +5488,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX7-NEXT:    v_and_b32_e32 v16, s6, v16
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v12
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -5541,27 +5512,20 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, s6, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s6, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, s6, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, s6, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, s6, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s6, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, s6, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v9
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
@@ -5573,8 +5537,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v11
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, s6, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v12
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
@@ -5582,12 +5545,16 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, s6, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, s6, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, s6, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, s6, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
@@ -5616,29 +5583,33 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v5
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v13, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v4, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v4, v4, s1, v9
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v15, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v16, v5, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v5, v5, s1, v10
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v17, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v6, v6, s1, v11
-; GFX9-NEXT:    v_and_b32_sdwa v19, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_or_b32 v9, v7, v0, v12
-; GFX9-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v4, v4, v13, v14
-; GFX9-NEXT:    v_or3_b32 v5, v5, v15, v16
-; GFX9-NEXT:    v_or3_b32 v7, v9, v19, v7
-; GFX9-NEXT:    v_or3_b32 v6, v6, v17, v18
+; GFX9-NEXT:    v_and_b32_sdwa v17, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v5, v5, s1, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v7
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v15, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_and_or_b32 v4, v4, s1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; GFX9-NEXT:    v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_or_b32 v6, v6, s1, v13
+; GFX9-NEXT:    v_and_or_b32 v9, v7, v0, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v14
+; GFX9-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 24, v16
+; GFX9-NEXT:    v_or3_b32 v4, v4, v17, v10
+; GFX9-NEXT:    v_or3_b32 v5, v5, v18, v11
+; GFX9-NEXT:    v_or3_b32 v7, v9, v7, v13
+; GFX9-NEXT:    v_or3_b32 v6, v6, v19, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v4, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[0:1]
@@ -5648,31 +5619,35 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX9-NEXT:    v_and_b32_sdwa v15, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v16, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_b32_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v13, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v15, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v4, v4, v0, v7
-; GFX9-NEXT:    v_and_or_b32 v5, v5, v0, v8
-; GFX9-NEXT:    v_and_b32_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v11, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX9-NEXT:    v_and_or_b32 v4, v4, v0, v8
+; GFX9-NEXT:    v_and_or_b32 v5, v5, v0, v10
+; GFX9-NEXT:    v_and_b32_sdwa v14, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v6
 ; GFX9-NEXT:    v_and_or_b32 v6, v2, v0, v1
-; GFX9-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_or3_b32 v1, v4, v12, v13
-; GFX9-NEXT:    v_or3_b32 v2, v5, v14, v15
+; GFX9-NEXT:    v_and_b32_sdwa v17, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX9-NEXT:    v_or3_b32 v1, v4, v15, v9
+; GFX9-NEXT:    v_or3_b32 v2, v5, v16, v11
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_or3_b32 v0, v3, v10, v11
-; GFX9-NEXT:    v_or3_b32 v3, v6, v16, v17
+; GFX9-NEXT:    v_or3_b32 v0, v3, v14, v7
+; GFX9-NEXT:    v_or3_b32 v3, v6, v17, v12
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
@@ -5696,35 +5671,39 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v10
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 8, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v7
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v12, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v17, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 8, v6
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v14, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX8-NEXT:    v_and_b32_sdwa v9, v5, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v17
-; GFX8-NEXT:    v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v18
-; GFX8-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v6, v6, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v7
+; GFX8-NEXT:    v_and_b32_sdwa v19, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v5
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v9, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v15, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 24, v14
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX8-NEXT:    v_and_b32_sdwa v11, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v17, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 24, v7
+; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 24, v16
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v13
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 24, v18
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v15
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
 ; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
@@ -5734,34 +5713,38 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX8-NEXT:    v_and_b32_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v11, v1, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v13, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v15, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v6, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v16
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v15
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX8-NEXT:    v_and_b32_sdwa v14, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v15, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v16, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT:    v_or_b32_e32 v5, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v4, v4, v16
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, v4, v11
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v12
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v13
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -5799,15 +5782,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 8, v7
 ; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v11, s0, v11
-; GFX7-NEXT:    v_and_b32_e32 v13, v13, v8
+; GFX7-NEXT:    v_and_b32_e32 v13, s0, v13
 ; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v9, s0, v9
-; GFX7-NEXT:    v_and_b32_e32 v12, s0, v12
 ; GFX7-NEXT:    v_and_b32_e32 v14, v14, v8
 ; GFX7-NEXT:    v_and_b32_e32 v16, v16, v8
 ; GFX7-NEXT:    v_and_b32_e32 v6, s0, v6
@@ -5817,7 +5798,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 24, v7
-; GFX7-NEXT:    v_and_b32_e32 v15, v15, v8
 ; GFX7-NEXT:    v_and_b32_e32 v17, v17, v8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
@@ -5827,7 +5807,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX7-NEXT:    v_or_b32_e32 v5, v6, v13
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
-; GFX7-NEXT:    v_and_b32_e32 v18, v18, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
@@ -5852,16 +5831,10 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v2, v2, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, v6, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, v7, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
@@ -5871,8 +5844,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v2, v9, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, v10, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v10
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
@@ -5884,8 +5856,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, v12, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, v13, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v13
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
@@ -5893,12 +5864,16 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
 ; GFX7-NEXT:    v_and_b32_e32 v3, v4, v8
 ; GFX7-NEXT:    v_and_b32_e32 v4, v14, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, v5, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, v15, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, v16, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 24, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index ea2631cbcb29..0b9a514d2398 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -46,8 +46,8 @@ define i8 @v_lshr_i8_7(i8 %value) {
 ; GFX9-LABEL: v_lshr_i8_7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 7
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 7
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -557,13 +557,11 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ;
 ; GFX8-LABEL: s_lshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -740,21 +738,17 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX8-LABEL: s_lshr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s6, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s2, s2, s6
-; GFX8-NEXT:    s_and_b32 s4, s4, s6
-; GFX8-NEXT:    s_and_b32 s7, s7, s6
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, s7
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s6
 ; GFX8-NEXT:    s_and_b32 s3, s3, s6
-; GFX8-NEXT:    s_and_b32 s5, s5, s6
-; GFX8-NEXT:    s_and_b32 s8, s8, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshr_b32 s3, s5, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
@@ -932,39 +926,31 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX8-LABEL: s_lshr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s12, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
 ; GFX8-NEXT:    s_and_b32 s4, s4, s12
-; GFX8-NEXT:    s_and_b32 s8, s8, s12
-; GFX8-NEXT:    s_and_b32 s13, s13, s12
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s8, s13
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
 ; GFX8-NEXT:    s_and_b32 s5, s5, s12
-; GFX8-NEXT:    s_and_b32 s9, s9, s12
-; GFX8-NEXT:    s_and_b32 s14, s14, s12
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
 ; GFX8-NEXT:    s_and_b32 s6, s6, s12
-; GFX8-NEXT:    s_and_b32 s10, s10, s12
-; GFX8-NEXT:    s_and_b32 s15, s15, s12
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, s14
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_and_b32 s3, s3, s12
 ; GFX8-NEXT:    s_and_b32 s7, s7, s12
-; GFX8-NEXT:    s_and_b32 s11, s11, s12
-; GFX8-NEXT:    s_and_b32 s16, s16, s12
 ; GFX8-NEXT:    s_lshr_b32 s6, s10, s15
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
new file mode 100644
index 000000000000..194d52e19b82
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir
@@ -0,0 +1,22 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:  remove_and_255_zextload
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: remove_and_255_zextload
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load 1, addrspace 1)
+    ; CHECK: $vgpr0 = COPY %load(s32)
+    %ptr:_(p1) = COPY $vgpr0_vgpr1
+    %load:_(s32) = G_ZEXTLOAD %ptr :: (load 1, addrspace 1, align 1)
+    %mask:_(s32) = G_CONSTANT i32 255
+    %and:_(s32) = G_AND %load, %mask
+    $vgpr0 = COPY %and
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 95a32223ac3f..b2f3dd8b2bf4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -461,7 +461,6 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX7-NEXT:    s_and_b32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s1, s1, s2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -473,7 +472,6 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, s2
 ; GFX8-NEXT:    s_and_b32 s0, s0, s4
-; GFX8-NEXT:    s_and_b32 s1, s1, s4
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s5, s4
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
@@ -485,9 +483,7 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s0, s0, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -506,14 +502,13 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -521,24 +516,19 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0xffff
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %and = and <2 x i16> %x, <i16 16383, i16 16383>
   %ext = zext <2 x i16> %and to <2 x i32>

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 015f6b5de8b0..f58e26604529 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -551,13 +551,11 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 ;
 ; GFX8-LABEL: s_shl_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s3
 ; GFX8-NEXT:    s_and_b32 s1, s1, s3
-; GFX8-NEXT:    s_and_b32 s2, s2, s3
-; GFX8-NEXT:    s_and_b32 s4, s4, s3
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -722,21 +720,17 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX8-LABEL: s_shl_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s6, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s2, s2, s6
-; GFX8-NEXT:    s_and_b32 s4, s4, s6
-; GFX8-NEXT:    s_and_b32 s7, s7, s6
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s4, s7
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s6
 ; GFX8-NEXT:    s_and_b32 s3, s3, s6
-; GFX8-NEXT:    s_and_b32 s5, s5, s6
-; GFX8-NEXT:    s_and_b32 s8, s8, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
@@ -898,39 +892,31 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX8-LABEL: s_shl_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s12, 0xffff
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s13, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
 ; GFX8-NEXT:    s_and_b32 s4, s4, s12
-; GFX8-NEXT:    s_and_b32 s8, s8, s12
-; GFX8-NEXT:    s_and_b32 s13, s13, s12
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s8, s13
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12
 ; GFX8-NEXT:    s_and_b32 s5, s5, s12
-; GFX8-NEXT:    s_and_b32 s9, s9, s12
-; GFX8-NEXT:    s_and_b32 s14, s14, s12
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX8-NEXT:    s_and_b32 s2, s2, s12
 ; GFX8-NEXT:    s_and_b32 s6, s6, s12
-; GFX8-NEXT:    s_and_b32 s10, s10, s12
-; GFX8-NEXT:    s_and_b32 s15, s15, s12
 ; GFX8-NEXT:    s_lshl_b32 s5, s9, s14
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, s12
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
 ; GFX8-NEXT:    s_and_b32 s3, s3, s12
 ; GFX8-NEXT:    s_and_b32 s7, s7, s12
-; GFX8-NEXT:    s_and_b32 s11, s11, s12
-; GFX8-NEXT:    s_and_b32 s16, s16, s12
 ; GFX8-NEXT:    s_lshl_b32 s6, s10, s15
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
 ; GFX8-NEXT:    s_and_b32 s1, s1, s12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 6814f5bb1843..8c2d18406f47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -205,10 +205,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_u32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -223,10 +220,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_add_u16_e64 v1, v3, v2 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i8:
@@ -291,10 +287,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_cmp_lt_u32 s3, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -311,11 +304,10 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -399,24 +391,19 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_min_u32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -565,17 +552,12 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
 ; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1892,10 +1874,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_cmp_lt_u32 s3, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1946,10 +1925,7 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1994,10 +1970,7 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2063,19 +2036,14 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i16:
@@ -2142,16 +2110,11 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s5, s4
 ; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v4i16:
@@ -2241,29 +2204,22 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v6i16:
@@ -2351,20 +2307,13 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s7, s6
 ; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
 ; GFX6-NEXT:    s_add_i32 s5, s5, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s6
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s6
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v6i16:
@@ -2466,36 +2415,27 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v6
 ; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v8i16:
@@ -2603,24 +2543,15 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s9, s8
 ; GFX6-NEXT:    s_cselect_b32 s8, s9, s8
 ; GFX6-NEXT:    s_add_i32 s7, s7, s8
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s8
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s8
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_or_b32 s3, s3, s4
+; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_or_b32 s2, s4, s2
+; GFX6-NEXT:    s_or_b32 s3, s6, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v8i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index b1a1a313f7e6..6dabfc3c495b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -199,10 +199,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -217,10 +214,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_sub_u16_e64 v1, v3, v2 clamp
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i8:
@@ -283,10 +279,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_cmp_lt_u32 s1, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_movk_i32 s2, 0xff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -303,11 +296,10 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -386,25 +378,20 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX6-NEXT:    v_min_u32_e32 v3, v2, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_min_u32_e32 v4, v3, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -549,17 +536,12 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_cmp_lt_u32 s3, s4
 ; GFX6-NEXT:    s_cselect_b32 s4, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_movk_i32 s4, 0xff
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1802,10 +1784,7 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_cmp_lt_u32 s1, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1854,10 +1833,7 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1900,10 +1876,7 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v2, s0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -1965,19 +1938,14 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_min_u32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i16:
@@ -2040,16 +2008,11 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s3, s4
 ; GFX6-NEXT:    s_cselect_b32 s4, s3, s4
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s4
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v4i16:
@@ -2133,29 +2096,22 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
 ; GFX6-NEXT:    v_min_u32_e32 v6, v4, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v6, v5, v6
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v6i16:
@@ -2237,20 +2193,13 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s5, s6
 ; GFX6-NEXT:    s_cselect_b32 s6, s5, s6
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s6
-; GFX6-NEXT:    s_mov_b32 s6, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s6
-; GFX6-NEXT:    s_and_b32 s0, s0, s6
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s6
-; GFX6-NEXT:    s_and_b32 s2, s3, s6
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, s6
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s6
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_or_b32 s2, s4, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v6i16:
@@ -2344,36 +2293,27 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
 ; GFX6-NEXT:    v_min_u32_e32 v8, v6, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v8, v7, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v8i16:
@@ -2473,24 +2413,15 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_cmp_lt_u32 s7, s8
 ; GFX6-NEXT:    s_cselect_b32 s8, s7, s8
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s8
-; GFX6-NEXT:    s_mov_b32 s8, 0xffff
-; GFX6-NEXT:    s_and_b32 s1, s1, s8
-; GFX6-NEXT:    s_and_b32 s0, s0, s8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s8
-; GFX6-NEXT:    s_and_b32 s2, s3, s8
-; GFX6-NEXT:    s_and_b32 s3, s5, s8
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s2, s4, s8
-; GFX6-NEXT:    s_and_b32 s4, s7, s8
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s3, s6, s8
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX6-NEXT:    s_or_b32 s3, s3, s4
+; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
+; GFX6-NEXT:    s_or_b32 s1, s2, s1
+; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
+; GFX6-NEXT:    s_or_b32 s2, s4, s2
+; GFX6-NEXT:    s_or_b32 s3, s6, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v8i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 41dc973c0258..4edc231fc141 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -37,7 +37,6 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_mov_b32 s3, s2
 ; GFX8-NEXT:    s_and_b32 s0, s0, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -118,21 +117,19 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_and_b32 s2, s0, s4
-; GFX8-NEXT:    s_and_b32 s0, s1, s4
-; GFX8-NEXT:    s_and_b32 s1, s5, s4
 ; GFX8-NEXT:    s_mov_b32 s5, s4
-; GFX8-NEXT:    s_and_b32 s3, s3, s4
-; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX8-NEXT:    s_and_b32 s6, s1, s4
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[4:5]
-; GFX8-NEXT:    s_and_b32 s1, s2, s4
-; GFX8-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_lshl_b32 s1, s7, 16
-; GFX8-NEXT:    s_and_b32 s2, s6, s4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, s4
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX8-NEXT:    s_and_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;


        


More information about the llvm-commits mailing list