[llvm] 78739fd - [DAG] Enable combineShiftOfShiftedLogic folds after type legalization

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 29 04:30:15 PDT 2022


Author: Simon Pilgrim
Date: 2022-10-29T12:30:04+01:00
New Revision: 78739fdb4d846c068ddd935d4cbd0474ff5fc6a9

URL: https://github.com/llvm/llvm-project/commit/78739fdb4d846c068ddd935d4cbd0474ff5fc6a9
DIFF: https://github.com/llvm/llvm-project/commit/78739fdb4d846c068ddd935d4cbd0474ff5fc6a9.diff

LOG: [DAG] Enable combineShiftOfShiftedLogic folds after type legalization

This fold was previously disabled to prevent regressions, which (at least in our current lit tests) appear to be limited to AMDGPU; I've addressed those by adding an AMDGPUTargetLowering::isDesirableToCommuteWithShift override.
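
For reference, the rewrite being enabled is just the distribution of a constant
shift over a shifted logic op: shift(logic(shift(x,c1),y),c2) becomes
logic(shift(x,c1+c2),shift(y,c2)). A minimal standalone C++ sketch of the
identity for the shl/or case (illustration only, not the DAG implementation;
it assumes constant shift amounts with c1+c2 still in range):

    #include <cassert>
    #include <cstdint>

    // Before the fold: shl(or(shl(x, C1), y), C2)
    static uint32_t beforeFold(uint32_t X, uint32_t Y, unsigned C1, unsigned C2) {
      return ((X << C1) | Y) << C2;
    }

    // After the fold: or(shl(x, C1 + C2), shl(y, C2))
    static uint32_t afterFold(uint32_t X, uint32_t Y, unsigned C1, unsigned C2) {
      return (X << (C1 + C2)) | (Y << C2);
    }

    int main() {
      for (uint32_t X : {0u, 1u, 0xDEADBEEFu})
        for (uint32_t Y : {0u, 0xFFu, 0x12345678u})
          assert(beforeFold(X, Y, 8, 16) == afterFold(X, Y, 8, 16));
      return 0;
    }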

Fixes #57872

Differential Revision: https://reviews.llvm.org/D136042

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/BPF/pr57872.ll
    llvm/test/CodeGen/Mips/cconv/return-struct.ll
    llvm/test/CodeGen/Mips/cconv/vector.ll
    llvm/test/CodeGen/Mips/load-store-left-right.ll
    llvm/test/CodeGen/Mips/unalignedload.ll
    llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/unaligned-load-store.ll
    llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
    llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/bool-vector.ll
    llvm/test/CodeGen/X86/combine-bitreverse.ll
    llvm/test/CodeGen/X86/is_fpclass.ll
    llvm/test/CodeGen/X86/vector-sext.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c402c2872afda..7d139fce7f758 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8877,13 +8877,9 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
   if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
     return SDValue();
 
-  // TODO: This is limited to early combining because it may reveal regressions
-  //       otherwise. But since we just checked a target hook to see if this is
-  //       desirable, that should have filtered out cases where this interferes
-  //       with some other pattern matching.
-  if (!LegalTypes)
-    if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
-      return R;
+  // Fold shift(bitop(shift(x,c1),y), c2) -> bitop(shift(x,c1+c2),shift(y,c2)).
+  if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
+    return R;
 
   // We want to pull some binops through shifts, so that we have (and (shift))
   // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d3e65516a526a..a908dc34e7567 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -839,6 +839,39 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 }
 
+bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
+    const SDNode* N, CombineLevel Level) const {
+  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+          N->getOpcode() == ISD::SRL) &&
+         "Expected shift op");
+  // Always commute pre-type legalization and right shifts.
+  // We're looking for shl(or(x,y),z) patterns.
+  if (Level < CombineLevel::AfterLegalizeTypes ||
+      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
+    return true;
+
+  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
+  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
+      (N->use_begin()->getOpcode() == ISD::SRA ||
+       N->use_begin()->getOpcode() == ISD::SRL))
+    return false;
+
+  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
+  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
+    if (LHS.getOpcode() != ISD::SHL)
+      return false;
+    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
+    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
+    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
+           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
+           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
+  };
+  SDValue LHS = N->getOperand(0).getOperand(0);
+  SDValue RHS = N->getOperand(0).getOperand(1);
+  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
+}
+
 //===---------------------------------------------------------------------===//
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//

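In rough terms, the new override keeps the commute enabled everywhere except
for shl(or(x,y),z) after type legalization, where it backs off in two cases:
when the i32 result's only user is a right shift (so an AMDGPU bit-field
extract pattern isn't split up), and when the OR combines a shifted
zero-extending load with another zero-extending load. A hypothetical
source-level shape of the second case, for illustration only (the function
and names below are made up, not taken from the patch):

    #include <cstdint>

    // Builds a 32-bit value from four byte loads. The inner ORs have the
    // or(shl(zextload i8, 8), zextload i8) shape that IsShiftAndLoad matches,
    // and the outer shl(or(...), 16) is the kind of node where the hook now
    // declines the commute so that shape is not destroyed.
    uint32_t load_u32_bytewise(const uint8_t *P) {
      uint32_t B0 = P[0], B1 = P[1], B2 = P[2], B3 = P[3];
      uint32_t Lo = (B1 << 8) | B0;
      uint32_t Hi = (B3 << 8) | B2;
      return (Hi << 16) | Lo;
    }
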
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 11ee9f9ff0dd5..619f9aff46a15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -173,6 +173,9 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
+  bool isDesirableToCommuteWithShift(const SDNode *N,
+                                     CombineLevel Level) const override;
+
   EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                           ISD::NodeType ExtendKind) const override;
 

diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 6a9cff4181434..1ada8947b6f56 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1448,24 +1448,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; SI-NEXT:    v_and_b32_e32 v7, 0xff00, v4
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 9, v4
+; SI-NEXT:    v_and_b32_e32 v6, 0xff00, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v4
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; SI-NEXT:    v_or_b32_e32 v0, v7, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v5
+; SI-NEXT:    v_or_b32_e32 v0, v6, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v4, 0xff000000, v4
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0

diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index a28baa21deb3e..b6f87a1f95636 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -151,26 +151,26 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-ALIGNED-NEXT:    flat_load_ubyte v4, v[2:3]
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-ALIGNED-NEXT:    flat_load_ubyte v5, v[0:1]
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v6, vcc, 3, v0
+; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX7-ALIGNED-NEXT:    flat_load_ubyte v6, v[6:7]
+; GFX7-ALIGNED-NEXT:    flat_load_ubyte v4, v[4:5]
 ; GFX7-ALIGNED-NEXT:    flat_load_ubyte v2, v[2:3]
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
-; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-ALIGNED-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1:

diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 8b6e1a65227fe..8ea3eb1480cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -208,19 +208,19 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 07261fa697a8a..f47c20ac68325 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2818,50 +2818,50 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_i32 v7, v2, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 0, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 0, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v2, 8, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 28, v2
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
 ; GFX7-NEXT:    v_bfe_i32 v9, v2, 12, 4
 ; GFX7-NEXT:    v_bfe_i32 v2, v2, 4, 4
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 8, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v15, 28, v0
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX7-NEXT:    v_ashrrev_i32_e32 v10, 28, v0
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
 ; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 4, 4
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
 ; GFX7-NEXT:    v_alignbit_b32 v9, 0, v9, 24
 ; GFX7-NEXT:    v_alignbit_b32 v16, 0, v16, 24
-; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 09c097b9813da..1969ac68efcfe 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2444,40 +2444,36 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v7, 15, v2
-; GFX7-NEXT:    v_bfe_u32 v6, v2, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v8, 15, v2
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v2, 12, 4
-; GFX7-NEXT:    v_bfe_u32 v13, v0, 4, 4
-; GFX7-NEXT:    v_bfe_u32 v15, v0, 12, 4
+; GFX7-NEXT:    v_and_b32_e32 v15, 15, v0
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 4, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 12, v2
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
+; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v14, v0, 4, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 28, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xf000000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xf000000, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v12, v0, 8, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; GFX7-NEXT:    v_alignbit_b32 v2, s10, v2, 24
+; GFX7-NEXT:    v_alignbit_b32 v0, 0, v0, 24
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT:    v_alignbit_b32 v8, 0, v8, 24
-; GFX7-NEXT:    v_alignbit_b32 v14, 0, v15, 24
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v2
-; GFX7-NEXT:    v_bfe_u32 v11, v0, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 28, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
-; GFX7-NEXT:    v_bfe_u32 v3, v2, 20, 4
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 20, 4
-; GFX7-NEXT:    v_alignbit_b32 v2, v9, v2, 24
-; GFX7-NEXT:    v_alignbit_b32 v0, v16, v0, 24
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index f27c6800c69e4..54111a4e1a09d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1060,17 +1060,18 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    s_lshr_b32 s42, s7, 22
 ; GCN-NEXT:    s_lshr_b32 s43, s7, 23
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
-; GCN-NEXT:    v_mov_b32_e32 v16, s43
+; GCN-NEXT:    v_mov_b32_e32 v14, s43
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v17, s42
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
 ; GCN-NEXT:    v_mov_b32_e32 v17, s41
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
@@ -1081,10 +1082,9 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
 ; GCN-NEXT:    v_mov_b32_e32 v17, s39
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
@@ -1092,10 +1092,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v18, s38
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_mov_b32_e32 v18, s37
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
@@ -1106,13 +1107,12 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
 ; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
 ; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s35
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
@@ -1120,10 +1120,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s35
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
@@ -1134,7 +1135,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
@@ -1145,81 +1145,82 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
-; GCN-NEXT:    v_mov_b32_e32 v14, s35
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
+; GCN-NEXT:    v_mov_b32_e32 v12, s35
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
 ; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
+; GCN-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v14, 15, v14
-; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v12, v12, v19
+; GCN-NEXT:    v_and_b32_e32 v12, 3, v12
+; GCN-NEXT:    v_or_b32_e32 v18, v12, v18
+; GCN-NEXT:    v_mov_b32_e32 v12, 15
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
+; GCN-NEXT:    v_and_b32_sdwa v18, v18, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
-; GCN-NEXT:    v_or_b32_sdwa v14, v16, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s7
+; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 15, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s7
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 14, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s7
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 13, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s7
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 12, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 10, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
+; GCN-NEXT:    v_lshrrev_b16_e64 v19, 9, s7
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
+; GCN-NEXT:    v_lshrrev_b16_e64 v16, 8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
+; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v19
+; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
+; GCN-NEXT:    v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
-; GCN-NEXT:    v_or_b32_sdwa v16, v17, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
@@ -1227,10 +1228,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
@@ -1241,7 +1243,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
@@ -1252,10 +1253,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
 ; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
@@ -1266,7 +1268,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
 ; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
 ; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
 ; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
@@ -1281,10 +1282,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v17, s33
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
 ; GCN-NEXT:    v_mov_b32_e32 v17, s31
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
@@ -1295,7 +1297,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
@@ -1306,10 +1307,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v18, s28
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_mov_b32_e32 v18, s27
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
@@ -1320,7 +1322,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
@@ -1334,10 +1335,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s25
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
@@ -1348,7 +1350,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
@@ -1359,13 +1360,14 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
+; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
 ; GCN-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
 ; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s25
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1373,12 +1375,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
 ; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v3, 15, v3
-; GCN-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
+; GCN-NEXT:    v_and_b32_sdwa v3, v3, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
 ; GCN-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
@@ -1388,10 +1389,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
-; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v17
 ; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
@@ -1402,7 +1404,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
 ; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
 ; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
@@ -1413,10 +1414,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
@@ -1427,13 +1429,12 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 4, v3
-; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
+; GCN-NEXT:    v_and_b32_sdwa v17, v17, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
-; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v17, v3, v17
 ; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
@@ -1441,10 +1442,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
-; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
@@ -1455,7 +1457,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
 ; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v3
@@ -1466,10 +1467,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
+; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
 ; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
-; GCN-NEXT:    v_or_b32_e32 v3, v19, v3
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
 ; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s6
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 64
@@ -1480,7 +1482,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
 ; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GCN-NEXT:    v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1495,11 +1496,12 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v15, s23
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 53
 ; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
 ; GCN-NEXT:    v_mov_b32_e32 v15, s22
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 52
@@ -1511,7 +1513,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 51
 ; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
@@ -1522,10 +1523,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v16, s19
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 49
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_mov_b32_e32 v16, s18
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 48
@@ -1536,7 +1538,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
 ; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
@@ -1550,10 +1551,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 61
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s16
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 60
@@ -1564,7 +1566,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 59
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
@@ -1575,13 +1576,14 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 56
 ; GCN-NEXT:    v_mov_b32_e32 v13, s16
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 57
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
 ; GCN-NEXT:    v_lshrrev_b16_e64 v17, 1, s16
 ; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1589,12 +1591,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v13, v13, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
 ; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v13, 15, v13
-; GCN-NEXT:    v_or_b32_sdwa v13, v13, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
+; GCN-NEXT:    v_and_b32_sdwa v13, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 47
 ; GCN-NEXT:    v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_lshrrev_b16_e64 v13, 15, s5
@@ -1604,10 +1605,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
 ; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 45
-; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
+; GCN-NEXT:    v_or_b32_e32 v13, v13, v15
 ; GCN-NEXT:    v_lshrrev_b16_e64 v15, 13, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 44
@@ -1618,7 +1620,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
 ; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 43
 ; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
@@ -1629,10 +1630,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 41
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v16, 9, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 40
@@ -1643,13 +1645,12 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 4, v13
-; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
+; GCN-NEXT:    v_and_b32_sdwa v15, v15, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 39
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT:    v_lshrrev_b16_e64 v13, 7, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 38
@@ -1657,10 +1658,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 37
-; GCN-NEXT:    v_or_b32_e32 v13, v16, v13
+; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 36
@@ -1671,7 +1673,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 35
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v13
@@ -1682,10 +1683,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 3, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 33
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v13
+; GCN-NEXT:    v_or_b32_e32 v17, v13, v17
 ; GCN-NEXT:    v_lshrrev_b16_e64 v13, 1, s5
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 32
@@ -1696,7 +1698,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
 ; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
@@ -1712,10 +1713,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v15, s14
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 21
-; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
 ; GCN-NEXT:    v_mov_b32_e32 v15, s13
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 20
@@ -1726,7 +1728,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 19
 ; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
@@ -1737,10 +1738,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_mov_b32_e32 v16, s10
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 17
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_mov_b32_e32 v16, s9
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 16
@@ -1751,7 +1753,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
 ; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
@@ -1765,10 +1766,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 29
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s1
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 28
@@ -1779,7 +1781,6 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
 ; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 27
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
@@ -1790,13 +1791,14 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
+; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 24
 ; GCN-NEXT:    v_mov_b32_e32 v17, s1
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 25
-; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s1
 ; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1804,12 +1806,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
 ; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
-; GCN-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
+; GCN-NEXT:    v_and_b32_sdwa v16, v16, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 15
 ; GCN-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_lshrrev_b16_e64 v15, 15, s4
@@ -1819,10 +1820,11 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
 ; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 13
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 12
@@ -1834,22 +1836,21 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 11
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s4
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
+; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s4
 ; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 10
-; GCN-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
+; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s4
 ; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v18, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v17, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 9
-; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
+; GCN-NEXT:    v_lshrrev_b16_e64 v13, 9, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v18, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 8
 ; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 7
 ; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
@@ -1884,38 +1885,38 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
+; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
+; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
 ; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
-; GCN-NEXT:    v_lshlrev_b16_e32 v10, 1, v10
 ; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
 ; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
-; GCN-NEXT:    v_lshlrev_b16_e32 v6, 1, v6
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v10
+; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
+; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
+; GCN-NEXT:    v_lshlrev_b16_e32 v10, 3, v10
+; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
 ; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v6
+; GCN-NEXT:    v_lshlrev_b16_e32 v6, 3, v6
+; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
+; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
 ; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
-; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
+; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
 ; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
-; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
+; GCN-NEXT:    v_or_b32_e32 v5, v6, v5
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v16
 ; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v11, 15, v11
+; GCN-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
+; GCN-NEXT:    v_and_b32_sdwa v11, v11, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
 ; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
-; GCN-NEXT:    v_or_b32_sdwa v11, v11, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GCN-NEXT:    v_or_b32_e32 v11, v15, v11
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
 ; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v5, s3

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index f1b4399fad099..c47760a01547f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1213,13 +1213,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_lshr_b32 s4, s11, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 15
 ; SI-NEXT:    s_cselect_b32 s4, s4, 5
-; SI-NEXT:    s_lshl_b32 s4, s4, 8
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
 ; SI-NEXT:    s_lshr_b32 s5, s11, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 14
 ; SI-NEXT:    s_cselect_b32 s5, s5, 5
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_lshr_b32 s5, s11, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 13
 ; SI-NEXT:    s_cselect_b32 s5, s5, 5
@@ -1233,13 +1233,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_lshr_b32 s5, s10, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 11
 ; SI-NEXT:    s_cselect_b32 s5, s5, 5
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_lshr_b32 s7, s10, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 10
 ; SI-NEXT:    s_cselect_b32 s7, s7, 5
 ; SI-NEXT:    s_and_b32 s7, s7, 0xff
-; SI-NEXT:    s_or_b32 s5, s7, s5
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_or_b32 s5, s5, s7
 ; SI-NEXT:    s_lshr_b32 s7, s10, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 9
 ; SI-NEXT:    s_cselect_b32 s7, s7, 5
@@ -1253,13 +1253,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_lshr_b32 s7, s9, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 7
 ; SI-NEXT:    s_cselect_b32 s7, s7, 5
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_lshl_b32 s7, s7, 24
 ; SI-NEXT:    s_lshr_b32 s10, s9, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 6
 ; SI-NEXT:    s_cselect_b32 s10, s10, 5
 ; SI-NEXT:    s_and_b32 s10, s10, 0xff
-; SI-NEXT:    s_or_b32 s7, s10, s7
-; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_or_b32 s7, s7, s10
 ; SI-NEXT:    s_lshr_b32 s10, s9, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 5
 ; SI-NEXT:    s_cselect_b32 s10, s10, 5
@@ -1273,13 +1273,13 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_lshr_b32 s9, s8, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 3
 ; SI-NEXT:    s_cselect_b32 s9, s9, 5
-; SI-NEXT:    s_lshl_b32 s9, s9, 8
+; SI-NEXT:    s_lshl_b32 s9, s9, 24
 ; SI-NEXT:    s_lshr_b32 s10, s8, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 2
 ; SI-NEXT:    s_cselect_b32 s10, s10, 5
 ; SI-NEXT:    s_and_b32 s10, s10, 0xff
-; SI-NEXT:    s_or_b32 s9, s10, s9
-; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_or_b32 s9, s9, s10
 ; SI-NEXT:    s_lshr_b32 s10, s8, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 1
 ; SI-NEXT:    s_cselect_b32 s10, s10, 5

diff  --git a/llvm/test/CodeGen/BPF/pr57872.ll b/llvm/test/CodeGen/BPF/pr57872.ll
index 95291e550229e..a9162496c9c15 100644
--- a/llvm/test/CodeGen/BPF/pr57872.ll
+++ b/llvm/test/CodeGen/BPF/pr57872.ll
@@ -1,9 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=bpf-- | FileCheck %s
-; XFAIL: *
 
 %struct.event = type { i8, [84 x i8] }
 
 define void @foo(ptr %g) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    r1 = *(u64 *)(r1 + 0)
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 83)
+; CHECK-NEXT:    *(u8 *)(r10 - 4) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 82)
+; CHECK-NEXT:    *(u8 *)(r10 - 5) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 81)
+; CHECK-NEXT:    *(u8 *)(r10 - 6) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 80)
+; CHECK-NEXT:    *(u8 *)(r10 - 7) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 79)
+; CHECK-NEXT:    *(u8 *)(r10 - 8) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 78)
+; CHECK-NEXT:    *(u8 *)(r10 - 9) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 77)
+; CHECK-NEXT:    *(u8 *)(r10 - 10) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 76)
+; CHECK-NEXT:    *(u8 *)(r10 - 11) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 75)
+; CHECK-NEXT:    *(u8 *)(r10 - 12) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 74)
+; CHECK-NEXT:    *(u8 *)(r10 - 13) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 73)
+; CHECK-NEXT:    *(u8 *)(r10 - 14) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 72)
+; CHECK-NEXT:    *(u8 *)(r10 - 15) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 71)
+; CHECK-NEXT:    *(u8 *)(r10 - 16) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 70)
+; CHECK-NEXT:    *(u8 *)(r10 - 17) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 69)
+; CHECK-NEXT:    *(u8 *)(r10 - 18) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 68)
+; CHECK-NEXT:    *(u8 *)(r10 - 19) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 67)
+; CHECK-NEXT:    *(u8 *)(r10 - 20) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 66)
+; CHECK-NEXT:    *(u8 *)(r10 - 21) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 65)
+; CHECK-NEXT:    *(u8 *)(r10 - 22) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 64)
+; CHECK-NEXT:    *(u8 *)(r10 - 23) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 63)
+; CHECK-NEXT:    *(u8 *)(r10 - 24) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 62)
+; CHECK-NEXT:    *(u8 *)(r10 - 25) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 61)
+; CHECK-NEXT:    *(u8 *)(r10 - 26) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 60)
+; CHECK-NEXT:    *(u8 *)(r10 - 27) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 59)
+; CHECK-NEXT:    *(u8 *)(r10 - 28) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 58)
+; CHECK-NEXT:    *(u8 *)(r10 - 29) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 57)
+; CHECK-NEXT:    *(u8 *)(r10 - 30) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 56)
+; CHECK-NEXT:    *(u8 *)(r10 - 31) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 55)
+; CHECK-NEXT:    *(u8 *)(r10 - 32) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 54)
+; CHECK-NEXT:    *(u8 *)(r10 - 33) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 53)
+; CHECK-NEXT:    *(u8 *)(r10 - 34) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 52)
+; CHECK-NEXT:    *(u8 *)(r10 - 35) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 51)
+; CHECK-NEXT:    *(u8 *)(r10 - 36) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 50)
+; CHECK-NEXT:    *(u8 *)(r10 - 37) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 49)
+; CHECK-NEXT:    *(u8 *)(r10 - 38) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 48)
+; CHECK-NEXT:    *(u8 *)(r10 - 39) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 47)
+; CHECK-NEXT:    *(u8 *)(r10 - 40) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 46)
+; CHECK-NEXT:    *(u8 *)(r10 - 41) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 45)
+; CHECK-NEXT:    *(u8 *)(r10 - 42) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 44)
+; CHECK-NEXT:    *(u8 *)(r10 - 43) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 43)
+; CHECK-NEXT:    *(u8 *)(r10 - 44) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 42)
+; CHECK-NEXT:    *(u8 *)(r10 - 45) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 41)
+; CHECK-NEXT:    *(u8 *)(r10 - 46) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 40)
+; CHECK-NEXT:    *(u8 *)(r10 - 47) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 39)
+; CHECK-NEXT:    *(u8 *)(r10 - 48) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 38)
+; CHECK-NEXT:    *(u8 *)(r10 - 49) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 37)
+; CHECK-NEXT:    *(u8 *)(r10 - 50) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 36)
+; CHECK-NEXT:    *(u8 *)(r10 - 51) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 35)
+; CHECK-NEXT:    *(u8 *)(r10 - 52) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 34)
+; CHECK-NEXT:    *(u8 *)(r10 - 53) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 33)
+; CHECK-NEXT:    *(u8 *)(r10 - 54) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 32)
+; CHECK-NEXT:    *(u8 *)(r10 - 55) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 31)
+; CHECK-NEXT:    *(u8 *)(r10 - 56) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 30)
+; CHECK-NEXT:    *(u8 *)(r10 - 57) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 29)
+; CHECK-NEXT:    *(u8 *)(r10 - 58) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 28)
+; CHECK-NEXT:    *(u8 *)(r10 - 59) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 27)
+; CHECK-NEXT:    *(u8 *)(r10 - 60) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 26)
+; CHECK-NEXT:    *(u8 *)(r10 - 61) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 25)
+; CHECK-NEXT:    *(u8 *)(r10 - 62) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 24)
+; CHECK-NEXT:    *(u8 *)(r10 - 63) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 23)
+; CHECK-NEXT:    *(u8 *)(r10 - 64) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 22)
+; CHECK-NEXT:    *(u8 *)(r10 - 65) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 21)
+; CHECK-NEXT:    *(u8 *)(r10 - 66) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 20)
+; CHECK-NEXT:    *(u8 *)(r10 - 67) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 19)
+; CHECK-NEXT:    *(u8 *)(r10 - 68) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 18)
+; CHECK-NEXT:    *(u8 *)(r10 - 69) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 17)
+; CHECK-NEXT:    *(u8 *)(r10 - 70) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 16)
+; CHECK-NEXT:    *(u8 *)(r10 - 71) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 15)
+; CHECK-NEXT:    *(u8 *)(r10 - 72) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 14)
+; CHECK-NEXT:    *(u8 *)(r10 - 73) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 13)
+; CHECK-NEXT:    *(u8 *)(r10 - 74) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 12)
+; CHECK-NEXT:    *(u8 *)(r10 - 75) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 11)
+; CHECK-NEXT:    *(u8 *)(r10 - 76) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 10)
+; CHECK-NEXT:    *(u8 *)(r10 - 77) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 9)
+; CHECK-NEXT:    *(u8 *)(r10 - 78) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 8)
+; CHECK-NEXT:    *(u8 *)(r10 - 79) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 7)
+; CHECK-NEXT:    *(u8 *)(r10 - 80) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 6)
+; CHECK-NEXT:    *(u8 *)(r10 - 81) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 5)
+; CHECK-NEXT:    *(u8 *)(r10 - 82) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 4)
+; CHECK-NEXT:    *(u8 *)(r10 - 83) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 3)
+; CHECK-NEXT:    *(u8 *)(r10 - 84) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 2)
+; CHECK-NEXT:    *(u8 *)(r10 - 85) = r2
+; CHECK-NEXT:    r2 = *(u8 *)(r1 + 1)
+; CHECK-NEXT:    *(u8 *)(r10 - 86) = r2
+; CHECK-NEXT:    r1 = *(u8 *)(r1 + 0)
+; CHECK-NEXT:    *(u8 *)(r10 - 87) = r1
+; CHECK-NEXT:    r1 = r10
+; CHECK-NEXT:    r1 += -88
+; CHECK-NEXT:    call bar
+; CHECK-NEXT:    exit
 entry:
   %event = alloca %struct.event, align 1
   %hostname = getelementptr inbounds %struct.event, ptr %event, i64 0, i32 1

diff  --git a/llvm/test/CodeGen/Mips/cconv/return-struct.ll b/llvm/test/CodeGen/Mips/cconv/return-struct.ll
index d7d3b49ebe496..49964d69a70d5 100644
--- a/llvm/test/CodeGen/Mips/cconv/return-struct.ll
+++ b/llvm/test/CodeGen/Mips/cconv/return-struct.ll
@@ -175,12 +175,12 @@ define inreg {i48} @ret_struct_3xi16() nounwind {
 ; N32-BE:       # %bb.0: # %entry
 ; N32-BE-NEXT:    lui $1, %hi(struct_3xi16)
 ; N32-BE-NEXT:    lw $2, %lo(struct_3xi16)($1)
-; N32-BE-NEXT:    dsll $2, $2, 16
+; N32-BE-NEXT:    dsll $2, $2, 32
 ; N32-BE-NEXT:    addiu $1, $1, %lo(struct_3xi16)
 ; N32-BE-NEXT:    lhu $1, 4($1)
-; N32-BE-NEXT:    or $1, $1, $2
+; N32-BE-NEXT:    dsll $1, $1, 16
 ; N32-BE-NEXT:    jr $ra
-; N32-BE-NEXT:    dsll $2, $1, 16
+; N32-BE-NEXT:    or $2, $2, $1
 ;
 ; N32-LE-LABEL: ret_struct_3xi16:
 ; N32-LE:       # %bb.0: # %entry
@@ -200,12 +200,12 @@ define inreg {i48} @ret_struct_3xi16() nounwind {
 ; N64-BE-NEXT:    daddiu $1, $1, %hi(struct_3xi16)
 ; N64-BE-NEXT:    dsll $1, $1, 16
 ; N64-BE-NEXT:    lw $2, %lo(struct_3xi16)($1)
-; N64-BE-NEXT:    dsll $2, $2, 16
+; N64-BE-NEXT:    dsll $2, $2, 32
 ; N64-BE-NEXT:    daddiu $1, $1, %lo(struct_3xi16)
 ; N64-BE-NEXT:    lhu $1, 4($1)
-; N64-BE-NEXT:    or $1, $1, $2
+; N64-BE-NEXT:    dsll $1, $1, 16
 ; N64-BE-NEXT:    jr $ra
-; N64-BE-NEXT:    dsll $2, $1, 16
+; N64-BE-NEXT:    or $2, $2, $1
 ;
 ; N64-LE-LABEL: ret_struct_3xi16:
 ; N64-LE:       # %bb.0: # %entry

diff  --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index d64f7b09ba41f..3ca0e72b84cb2 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -469,14 +469,14 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
 ; MIPS32-NEXT:    srl $1, $5, 24
 ; MIPS32-NEXT:    srl $2, $4, 24
 ; MIPS32-NEXT:    addu $1, $2, $1
-; MIPS32-NEXT:    sll $1, $1, 8
-; MIPS32-NEXT:    srl $2, $5, 16
-; MIPS32-NEXT:    srl $3, $4, 16
-; MIPS32-NEXT:    addu $2, $3, $2
-; MIPS32-NEXT:    andi $2, $2, 255
-; MIPS32-NEXT:    or $1, $2, $1
 ; MIPS32-NEXT:    addu $2, $4, $5
-; MIPS32-NEXT:    sll $1, $1, 16
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    srl $3, $5, 16
+; MIPS32-NEXT:    srl $6, $4, 16
+; MIPS32-NEXT:    addu $3, $6, $3
+; MIPS32-NEXT:    andi $3, $3, 255
+; MIPS32-NEXT:    sll $3, $3, 16
+; MIPS32-NEXT:    or $1, $1, $3
 ; MIPS32-NEXT:    andi $2, $2, 255
 ; MIPS32-NEXT:    srl $3, $5, 8
 ; MIPS32-NEXT:    srl $4, $4, 8
@@ -495,14 +495,14 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
 ; MIPS64-NEXT:    sll $3, $4, 0
 ; MIPS64-NEXT:    srl $4, $3, 24
 ; MIPS64-NEXT:    addu $2, $4, $2
-; MIPS64-NEXT:    sll $2, $2, 8
-; MIPS64-NEXT:    srl $4, $1, 16
-; MIPS64-NEXT:    srl $5, $3, 16
-; MIPS64-NEXT:    addu $4, $5, $4
-; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    or $2, $4, $2
 ; MIPS64-NEXT:    addu $4, $3, $1
-; MIPS64-NEXT:    sll $2, $2, 16
+; MIPS64-NEXT:    sll $2, $2, 24
+; MIPS64-NEXT:    srl $5, $1, 16
+; MIPS64-NEXT:    srl $6, $3, 16
+; MIPS64-NEXT:    addu $5, $6, $5
+; MIPS64-NEXT:    andi $5, $5, 255
+; MIPS64-NEXT:    sll $5, $5, 16
+; MIPS64-NEXT:    or $2, $2, $5
 ; MIPS64-NEXT:    andi $4, $4, 255
 ; MIPS64-NEXT:    srl $1, $1, 8
 ; MIPS64-NEXT:    srl $3, $3, 8
@@ -592,37 +592,37 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
 define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 ; MIPS32-LABEL: i8_8:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    srl $1, $6, 24
-; MIPS32-NEXT:    srl $2, $4, 24
-; MIPS32-NEXT:    addu $1, $2, $1
-; MIPS32-NEXT:    sll $1, $1, 8
-; MIPS32-NEXT:    srl $2, $6, 16
-; MIPS32-NEXT:    srl $3, $4, 16
+; MIPS32-NEXT:    addu $1, $4, $6
+; MIPS32-NEXT:    srl $2, $6, 24
+; MIPS32-NEXT:    srl $3, $4, 24
 ; MIPS32-NEXT:    addu $2, $3, $2
-; MIPS32-NEXT:    andi $2, $2, 255
-; MIPS32-NEXT:    srl $3, $7, 24
-; MIPS32-NEXT:    srl $8, $5, 24
-; MIPS32-NEXT:    or $1, $2, $1
-; MIPS32-NEXT:    addu $2, $8, $3
-; MIPS32-NEXT:    addu $3, $4, $6
-; MIPS32-NEXT:    sll $2, $2, 8
-; MIPS32-NEXT:    srl $8, $7, 16
-; MIPS32-NEXT:    srl $9, $5, 16
-; MIPS32-NEXT:    addu $8, $9, $8
-; MIPS32-NEXT:    andi $8, $8, 255
-; MIPS32-NEXT:    or $8, $8, $2
-; MIPS32-NEXT:    sll $1, $1, 16
-; MIPS32-NEXT:    andi $2, $3, 255
+; MIPS32-NEXT:    andi $1, $1, 255
 ; MIPS32-NEXT:    srl $3, $6, 8
-; MIPS32-NEXT:    srl $4, $4, 8
-; MIPS32-NEXT:    addu $3, $4, $3
+; MIPS32-NEXT:    srl $8, $4, 8
+; MIPS32-NEXT:    addu $3, $8, $3
 ; MIPS32-NEXT:    sll $3, $3, 8
-; MIPS32-NEXT:    or $2, $2, $3
-; MIPS32-NEXT:    andi $2, $2, 65535
-; MIPS32-NEXT:    addu $3, $5, $7
-; MIPS32-NEXT:    or $2, $2, $1
-; MIPS32-NEXT:    sll $1, $8, 16
+; MIPS32-NEXT:    srl $6, $6, 16
+; MIPS32-NEXT:    srl $4, $4, 16
+; MIPS32-NEXT:    or $1, $1, $3
+; MIPS32-NEXT:    sll $2, $2, 24
+; MIPS32-NEXT:    addu $3, $4, $6
 ; MIPS32-NEXT:    andi $3, $3, 255
+; MIPS32-NEXT:    sll $3, $3, 16
+; MIPS32-NEXT:    srl $4, $7, 24
+; MIPS32-NEXT:    srl $6, $5, 24
+; MIPS32-NEXT:    or $2, $2, $3
+; MIPS32-NEXT:    andi $1, $1, 65535
+; MIPS32-NEXT:    addu $3, $6, $4
+; MIPS32-NEXT:    addu $4, $5, $7
+; MIPS32-NEXT:    sll $3, $3, 24
+; MIPS32-NEXT:    srl $6, $7, 16
+; MIPS32-NEXT:    srl $8, $5, 16
+; MIPS32-NEXT:    addu $6, $8, $6
+; MIPS32-NEXT:    andi $6, $6, 255
+; MIPS32-NEXT:    sll $6, $6, 16
+; MIPS32-NEXT:    or $2, $1, $2
+; MIPS32-NEXT:    or $1, $3, $6
+; MIPS32-NEXT:    andi $3, $4, 255
 ; MIPS32-NEXT:    srl $4, $7, 8
 ; MIPS32-NEXT:    srl $5, $5, 8
 ; MIPS32-NEXT:    addu $4, $5, $4
@@ -635,57 +635,57 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 ;
 ; MIPS64-LABEL: i8_8:
 ; MIPS64:       # %bb.0:
-; MIPS64-NEXT:    dsrl $1, $5, 56
+; MIPS64-NEXT:    dsrl $1, $5, 48
 ; MIPS64-NEXT:    sll $1, $1, 0
-; MIPS64-NEXT:    dsrl $2, $4, 56
+; MIPS64-NEXT:    dsrl $2, $4, 48
 ; MIPS64-NEXT:    sll $2, $2, 0
 ; MIPS64-NEXT:    addu $1, $2, $1
-; MIPS64-NEXT:    dsrl $2, $5, 48
-; MIPS64-NEXT:    sll $1, $1, 8
+; MIPS64-NEXT:    dsrl $2, $5, 56
+; MIPS64-NEXT:    andi $1, $1, 255
 ; MIPS64-NEXT:    sll $2, $2, 0
-; MIPS64-NEXT:    dsrl $3, $4, 48
+; MIPS64-NEXT:    dsrl $3, $4, 56
 ; MIPS64-NEXT:    sll $3, $3, 0
 ; MIPS64-NEXT:    addu $2, $3, $2
-; MIPS64-NEXT:    andi $2, $2, 255
 ; MIPS64-NEXT:    dsrl $3, $5, 40
-; MIPS64-NEXT:    or $1, $2, $1
-; MIPS64-NEXT:    sll $2, $5, 0
+; MIPS64-NEXT:    sll $2, $2, 24
+; MIPS64-NEXT:    sll $1, $1, 16
 ; MIPS64-NEXT:    sll $3, $3, 0
 ; MIPS64-NEXT:    dsrl $6, $4, 40
 ; MIPS64-NEXT:    sll $6, $6, 0
 ; MIPS64-NEXT:    addu $3, $6, $3
-; MIPS64-NEXT:    dsrl $5, $5, 32
-; MIPS64-NEXT:    srl $6, $2, 24
+; MIPS64-NEXT:    dsrl $6, $5, 32
 ; MIPS64-NEXT:    sll $7, $4, 0
-; MIPS64-NEXT:    srl $8, $7, 24
-; MIPS64-NEXT:    addu $6, $8, $6
-; MIPS64-NEXT:    sll $1, $1, 16
-; MIPS64-NEXT:    sll $3, $3, 8
 ; MIPS64-NEXT:    sll $5, $5, 0
+; MIPS64-NEXT:    srl $8, $5, 24
+; MIPS64-NEXT:    srl $9, $7, 24
+; MIPS64-NEXT:    or $1, $2, $1
+; MIPS64-NEXT:    sll $2, $3, 8
+; MIPS64-NEXT:    sll $3, $6, 0
 ; MIPS64-NEXT:    dsrl $4, $4, 32
 ; MIPS64-NEXT:    sll $4, $4, 0
-; MIPS64-NEXT:    addu $4, $4, $5
-; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    or $3, $4, $3
-; MIPS64-NEXT:    andi $3, $3, 65535
-; MIPS64-NEXT:    or $1, $3, $1
-; MIPS64-NEXT:    sll $3, $6, 8
-; MIPS64-NEXT:    srl $4, $2, 16
-; MIPS64-NEXT:    srl $5, $7, 16
-; MIPS64-NEXT:    addu $4, $5, $4
+; MIPS64-NEXT:    addu $3, $4, $3
+; MIPS64-NEXT:    andi $3, $3, 255
+; MIPS64-NEXT:    or $2, $3, $2
+; MIPS64-NEXT:    andi $2, $2, 65535
+; MIPS64-NEXT:    or $1, $2, $1
+; MIPS64-NEXT:    addu $2, $9, $8
+; MIPS64-NEXT:    addu $3, $7, $5
+; MIPS64-NEXT:    sll $2, $2, 24
+; MIPS64-NEXT:    srl $4, $5, 16
+; MIPS64-NEXT:    srl $6, $7, 16
+; MIPS64-NEXT:    addu $4, $6, $4
 ; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    or $3, $4, $3
-; MIPS64-NEXT:    addu $4, $7, $2
+; MIPS64-NEXT:    sll $4, $4, 16
 ; MIPS64-NEXT:    dsll $1, $1, 32
-; MIPS64-NEXT:    sll $3, $3, 16
-; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    srl $2, $2, 8
+; MIPS64-NEXT:    or $2, $2, $4
+; MIPS64-NEXT:    andi $3, $3, 255
+; MIPS64-NEXT:    srl $4, $5, 8
 ; MIPS64-NEXT:    srl $5, $7, 8
-; MIPS64-NEXT:    addu $2, $5, $2
-; MIPS64-NEXT:    sll $2, $2, 8
-; MIPS64-NEXT:    or $2, $4, $2
-; MIPS64-NEXT:    andi $2, $2, 65535
-; MIPS64-NEXT:    or $2, $2, $3
+; MIPS64-NEXT:    addu $4, $5, $4
+; MIPS64-NEXT:    sll $4, $4, 8
+; MIPS64-NEXT:    or $3, $3, $4
+; MIPS64-NEXT:    andi $3, $3, 65535
+; MIPS64-NEXT:    or $2, $3, $2
 ; MIPS64-NEXT:    dsll $2, $2, 32
 ; MIPS64-NEXT:    dsrl $2, $2, 32
 ; MIPS64-NEXT:    or $2, $2, $1
@@ -916,84 +916,84 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 define <16 x i8> @i8_16(<16 x i8> %a, <16 x i8> %b) {
 ; MIPS32-LABEL: i8_16:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lw $1, 24($sp)
-; MIPS32-NEXT:    srl $2, $1, 24
-; MIPS32-NEXT:    srl $3, $6, 24
-; MIPS32-NEXT:    srl $8, $1, 16
-; MIPS32-NEXT:    srl $9, $6, 16
-; MIPS32-NEXT:    srl $10, $1, 8
-; MIPS32-NEXT:    srl $11, $6, 8
-; MIPS32-NEXT:    lw $12, 20($sp)
-; MIPS32-NEXT:    srl $13, $12, 8
-; MIPS32-NEXT:    srl $14, $5, 8
-; MIPS32-NEXT:    addu $13, $14, $13
-; MIPS32-NEXT:    addu $14, $5, $12
-; MIPS32-NEXT:    addu $10, $11, $10
-; MIPS32-NEXT:    addu $1, $6, $1
-; MIPS32-NEXT:    addu $6, $9, $8
-; MIPS32-NEXT:    addu $2, $3, $2
-; MIPS32-NEXT:    srl $3, $12, 24
-; MIPS32-NEXT:    srl $8, $5, 24
-; MIPS32-NEXT:    srl $9, $12, 16
-; MIPS32-NEXT:    srl $5, $5, 16
-; MIPS32-NEXT:    addu $5, $5, $9
-; MIPS32-NEXT:    addu $3, $8, $3
-; MIPS32-NEXT:    sll $2, $2, 8
-; MIPS32-NEXT:    andi $6, $6, 255
-; MIPS32-NEXT:    andi $1, $1, 255
-; MIPS32-NEXT:    sll $8, $10, 8
-; MIPS32-NEXT:    andi $9, $14, 255
-; MIPS32-NEXT:    sll $10, $13, 8
-; MIPS32-NEXT:    lw $11, 28($sp)
-; MIPS32-NEXT:    lw $12, 16($sp)
-; MIPS32-NEXT:    srl $13, $12, 24
-; MIPS32-NEXT:    srl $14, $4, 24
-; MIPS32-NEXT:    srl $15, $11, 24
-; MIPS32-NEXT:    srl $24, $7, 24
-; MIPS32-NEXT:    or $9, $9, $10
-; MIPS32-NEXT:    or $1, $1, $8
-; MIPS32-NEXT:    or $2, $6, $2
-; MIPS32-NEXT:    addu $6, $24, $15
-; MIPS32-NEXT:    sll $3, $3, 8
-; MIPS32-NEXT:    andi $5, $5, 255
-; MIPS32-NEXT:    addu $8, $14, $13
-; MIPS32-NEXT:    sll $8, $8, 8
-; MIPS32-NEXT:    srl $10, $12, 16
+; MIPS32-NEXT:    lw $1, 16($sp)
+; MIPS32-NEXT:    lw $2, 20($sp)
+; MIPS32-NEXT:    lw $3, 24($sp)
+; MIPS32-NEXT:    srl $8, $3, 8
+; MIPS32-NEXT:    srl $9, $6, 8
+; MIPS32-NEXT:    srl $10, $2, 16
+; MIPS32-NEXT:    srl $11, $5, 16
+; MIPS32-NEXT:    srl $12, $1, 16
 ; MIPS32-NEXT:    srl $13, $4, 16
-; MIPS32-NEXT:    addu $10, $13, $10
-; MIPS32-NEXT:    andi $10, $10, 255
-; MIPS32-NEXT:    or $8, $10, $8
-; MIPS32-NEXT:    or $3, $5, $3
-; MIPS32-NEXT:    addu $5, $4, $12
-; MIPS32-NEXT:    sll $6, $6, 8
-; MIPS32-NEXT:    srl $10, $11, 16
-; MIPS32-NEXT:    srl $13, $7, 16
-; MIPS32-NEXT:    addu $10, $13, $10
+; MIPS32-NEXT:    srl $14, $1, 8
+; MIPS32-NEXT:    srl $15, $4, 8
+; MIPS32-NEXT:    addu $24, $6, $3
+; MIPS32-NEXT:    addu $14, $15, $14
+; MIPS32-NEXT:    addu $15, $4, $1
+; MIPS32-NEXT:    addu $12, $13, $12
+; MIPS32-NEXT:    addu $10, $11, $10
+; MIPS32-NEXT:    srl $11, $2, 24
+; MIPS32-NEXT:    addu $13, $5, $2
+; MIPS32-NEXT:    addu $8, $9, $8
+; MIPS32-NEXT:    srl $1, $1, 24
+; MIPS32-NEXT:    srl $4, $4, 24
+; MIPS32-NEXT:    srl $9, $5, 24
+; MIPS32-NEXT:    srl $25, $3, 24
+; MIPS32-NEXT:    srl $gp, $6, 24
+; MIPS32-NEXT:    addu $25, $gp, $25
 ; MIPS32-NEXT:    andi $10, $10, 255
-; MIPS32-NEXT:    or $6, $10, $6
-; MIPS32-NEXT:    sll $10, $2, 16
-; MIPS32-NEXT:    andi $1, $1, 65535
+; MIPS32-NEXT:    addu $9, $9, $11
+; MIPS32-NEXT:    andi $11, $12, 255
+; MIPS32-NEXT:    addu $1, $4, $1
+; MIPS32-NEXT:    andi $4, $15, 255
+; MIPS32-NEXT:    sll $12, $14, 8
+; MIPS32-NEXT:    andi $14, $24, 255
+; MIPS32-NEXT:    sll $8, $8, 8
+; MIPS32-NEXT:    andi $13, $13, 255
+; MIPS32-NEXT:    srl $2, $2, 8
+; MIPS32-NEXT:    srl $5, $5, 8
+; MIPS32-NEXT:    addu $2, $5, $2
+; MIPS32-NEXT:    sll $2, $2, 8
+; MIPS32-NEXT:    srl $3, $3, 16
+; MIPS32-NEXT:    srl $5, $6, 16
+; MIPS32-NEXT:    or $2, $13, $2
+; MIPS32-NEXT:    or $6, $14, $8
+; MIPS32-NEXT:    or $4, $4, $12
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    sll $8, $11, 16
+; MIPS32-NEXT:    sll $9, $9, 24
+; MIPS32-NEXT:    sll $10, $10, 16
+; MIPS32-NEXT:    sll $11, $25, 24
+; MIPS32-NEXT:    addu $3, $5, $3
+; MIPS32-NEXT:    andi $3, $3, 255
 ; MIPS32-NEXT:    sll $3, $3, 16
-; MIPS32-NEXT:    andi $9, $9, 65535
-; MIPS32-NEXT:    sll $2, $8, 16
-; MIPS32-NEXT:    andi $5, $5, 255
-; MIPS32-NEXT:    srl $8, $12, 8
-; MIPS32-NEXT:    srl $4, $4, 8
-; MIPS32-NEXT:    addu $4, $4, $8
-; MIPS32-NEXT:    sll $4, $4, 8
-; MIPS32-NEXT:    or $4, $5, $4
+; MIPS32-NEXT:    lw $5, 28($sp)
+; MIPS32-NEXT:    srl $12, $5, 24
+; MIPS32-NEXT:    srl $13, $7, 24
+; MIPS32-NEXT:    or $11, $11, $3
+; MIPS32-NEXT:    or $3, $9, $10
+; MIPS32-NEXT:    or $1, $1, $8
 ; MIPS32-NEXT:    andi $4, $4, 65535
-; MIPS32-NEXT:    addu $5, $7, $11
-; MIPS32-NEXT:    or $2, $4, $2
+; MIPS32-NEXT:    addu $8, $13, $12
+; MIPS32-NEXT:    andi $6, $6, 65535
+; MIPS32-NEXT:    andi $9, $2, 65535
+; MIPS32-NEXT:    addu $10, $7, $5
+; MIPS32-NEXT:    sll $8, $8, 24
+; MIPS32-NEXT:    srl $2, $5, 16
+; MIPS32-NEXT:    srl $12, $7, 16
+; MIPS32-NEXT:    addu $2, $12, $2
+; MIPS32-NEXT:    andi $2, $2, 255
+; MIPS32-NEXT:    sll $12, $2, 16
+; MIPS32-NEXT:    or $2, $4, $1
 ; MIPS32-NEXT:    or $3, $9, $3
-; MIPS32-NEXT:    or $4, $1, $10
-; MIPS32-NEXT:    sll $1, $6, 16
-; MIPS32-NEXT:    andi $5, $5, 255
-; MIPS32-NEXT:    srl $6, $11, 8
+; MIPS32-NEXT:    or $4, $6, $11
+; MIPS32-NEXT:    or $1, $8, $12
+; MIPS32-NEXT:    andi $6, $10, 255
+; MIPS32-NEXT:    srl $5, $5, 8
 ; MIPS32-NEXT:    srl $7, $7, 8
-; MIPS32-NEXT:    addu $6, $7, $6
-; MIPS32-NEXT:    sll $6, $6, 8
-; MIPS32-NEXT:    or $5, $5, $6
+; MIPS32-NEXT:    addu $5, $7, $5
+; MIPS32-NEXT:    sll $5, $5, 8
+; MIPS32-NEXT:    or $5, $6, $5
 ; MIPS32-NEXT:    andi $5, $5, 65535
 ; MIPS32-NEXT:    or $5, $5, $1
 ; MIPS32-NEXT:    jr $ra
@@ -1001,111 +1001,111 @@ define <16 x i8> @i8_16(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; MIPS64-LABEL: i8_16:
 ; MIPS64:       # %bb.0:
-; MIPS64-NEXT:    dsrl $1, $7, 56
-; MIPS64-NEXT:    dsrl $2, $5, 56
-; MIPS64-NEXT:    dsrl $3, $7, 48
-; MIPS64-NEXT:    dsrl $8, $5, 48
-; MIPS64-NEXT:    dsrl $9, $6, 56
-; MIPS64-NEXT:    dsrl $10, $4, 56
-; MIPS64-NEXT:    dsrl $11, $7, 32
-; MIPS64-NEXT:    sll $1, $1, 0
+; MIPS64-NEXT:    sll $1, $6, 0
+; MIPS64-NEXT:    dsrl $2, $6, 56
+; MIPS64-NEXT:    dsrl $3, $6, 48
+; MIPS64-NEXT:    dsrl $8, $4, 48
+; MIPS64-NEXT:    srl $9, $1, 16
+; MIPS64-NEXT:    sll $10, $4, 0
+; MIPS64-NEXT:    srl $11, $10, 16
+; MIPS64-NEXT:    dsrl $12, $7, 56
+; MIPS64-NEXT:    addu $13, $10, $1
+; MIPS64-NEXT:    addu $9, $11, $9
 ; MIPS64-NEXT:    sll $2, $2, 0
+; MIPS64-NEXT:    dsrl $11, $7, 48
+; MIPS64-NEXT:    srl $14, $1, 8
+; MIPS64-NEXT:    srl $15, $10, 8
+; MIPS64-NEXT:    addu $14, $15, $14
+; MIPS64-NEXT:    dsrl $15, $4, 56
+; MIPS64-NEXT:    dsrl $24, $7, 40
 ; MIPS64-NEXT:    sll $3, $3, 0
 ; MIPS64-NEXT:    sll $8, $8, 0
-; MIPS64-NEXT:    dsrl $12, $7, 40
-; MIPS64-NEXT:    sll $12, $12, 0
-; MIPS64-NEXT:    dsrl $13, $5, 40
-; MIPS64-NEXT:    sll $13, $13, 0
-; MIPS64-NEXT:    addu $12, $13, $12
-; MIPS64-NEXT:    addu $3, $8, $3
-; MIPS64-NEXT:    addu $1, $2, $1
-; MIPS64-NEXT:    sll $2, $9, 0
-; MIPS64-NEXT:    sll $8, $10, 0
-; MIPS64-NEXT:    dsrl $9, $6, 48
-; MIPS64-NEXT:    sll $9, $9, 0
-; MIPS64-NEXT:    dsrl $10, $4, 48
-; MIPS64-NEXT:    sll $10, $10, 0
-; MIPS64-NEXT:    addu $9, $10, $9
-; MIPS64-NEXT:    addu $2, $8, $2
-; MIPS64-NEXT:    sll $8, $1, 8
-; MIPS64-NEXT:    andi $3, $3, 255
-; MIPS64-NEXT:    sll $1, $12, 8
-; MIPS64-NEXT:    sll $10, $11, 0
-; MIPS64-NEXT:    dsrl $11, $5, 32
-; MIPS64-NEXT:    sll $11, $11, 0
-; MIPS64-NEXT:    addu $10, $11, $10
-; MIPS64-NEXT:    andi $10, $10, 255
-; MIPS64-NEXT:    or $10, $10, $1
-; MIPS64-NEXT:    sll $1, $6, 0
-; MIPS64-NEXT:    or $8, $3, $8
-; MIPS64-NEXT:    sll $2, $2, 8
+; MIPS64-NEXT:    sll $15, $15, 0
 ; MIPS64-NEXT:    andi $9, $9, 255
-; MIPS64-NEXT:    dsrl $11, $6, 40
-; MIPS64-NEXT:    srl $3, $1, 24
-; MIPS64-NEXT:    sll $12, $4, 0
-; MIPS64-NEXT:    srl $13, $12, 24
-; MIPS64-NEXT:    srl $14, $1, 16
-; MIPS64-NEXT:    srl $15, $12, 16
-; MIPS64-NEXT:    andi $10, $10, 65535
-; MIPS64-NEXT:    addu $14, $15, $14
-; MIPS64-NEXT:    addu $13, $13, $3
-; MIPS64-NEXT:    sll $3, $7, 0
-; MIPS64-NEXT:    or $2, $9, $2
-; MIPS64-NEXT:    sll $7, $8, 16
+; MIPS64-NEXT:    addu $2, $15, $2
+; MIPS64-NEXT:    andi $13, $13, 255
+; MIPS64-NEXT:    sll $14, $14, 8
+; MIPS64-NEXT:    addu $3, $8, $3
 ; MIPS64-NEXT:    sll $8, $11, 0
-; MIPS64-NEXT:    dsrl $9, $4, 40
-; MIPS64-NEXT:    sll $9, $9, 0
-; MIPS64-NEXT:    addu $8, $9, $8
+; MIPS64-NEXT:    srl $1, $1, 24
+; MIPS64-NEXT:    sll $11, $12, 0
+; MIPS64-NEXT:    dsrl $12, $5, 56
+; MIPS64-NEXT:    dsrl $15, $5, 48
+; MIPS64-NEXT:    andi $3, $3, 255
+; MIPS64-NEXT:    dsrl $25, $6, 40
+; MIPS64-NEXT:    sll $15, $15, 0
+; MIPS64-NEXT:    srl $10, $10, 24
+; MIPS64-NEXT:    sll $12, $12, 0
+; MIPS64-NEXT:    or $13, $13, $14
+; MIPS64-NEXT:    sll $14, $24, 0
+; MIPS64-NEXT:    sll $2, $2, 24
+; MIPS64-NEXT:    addu $11, $12, $11
+; MIPS64-NEXT:    sll $9, $9, 16
+; MIPS64-NEXT:    addu $1, $10, $1
+; MIPS64-NEXT:    addu $8, $15, $8
+; MIPS64-NEXT:    sll $10, $25, 0
+; MIPS64-NEXT:    dsrl $12, $4, 40
+; MIPS64-NEXT:    sll $12, $12, 0
+; MIPS64-NEXT:    addu $10, $12, $10
+; MIPS64-NEXT:    sll $3, $3, 16
+; MIPS64-NEXT:    andi $8, $8, 255
+; MIPS64-NEXT:    sll $1, $1, 24
+; MIPS64-NEXT:    dsrl $12, $5, 40
+; MIPS64-NEXT:    sll $12, $12, 0
 ; MIPS64-NEXT:    dsrl $6, $6, 32
-; MIPS64-NEXT:    srl $9, $3, 24
-; MIPS64-NEXT:    sll $5, $5, 0
-; MIPS64-NEXT:    srl $11, $5, 24
-; MIPS64-NEXT:    or $7, $10, $7
-; MIPS64-NEXT:    addu $9, $11, $9
-; MIPS64-NEXT:    sll $10, $13, 8
-; MIPS64-NEXT:    andi $11, $14, 255
-; MIPS64-NEXT:    sll $2, $2, 16
-; MIPS64-NEXT:    sll $8, $8, 8
+; MIPS64-NEXT:    or $1, $1, $9
+; MIPS64-NEXT:    addu $9, $12, $14
+; MIPS64-NEXT:    sll $11, $11, 24
+; MIPS64-NEXT:    sll $8, $8, 16
+; MIPS64-NEXT:    dsrl $12, $7, 32
+; MIPS64-NEXT:    andi $13, $13, 65535
+; MIPS64-NEXT:    or $2, $2, $3
+; MIPS64-NEXT:    sll $3, $10, 8
 ; MIPS64-NEXT:    sll $6, $6, 0
 ; MIPS64-NEXT:    dsrl $4, $4, 32
 ; MIPS64-NEXT:    sll $4, $4, 0
 ; MIPS64-NEXT:    addu $4, $4, $6
 ; MIPS64-NEXT:    andi $4, $4, 255
-; MIPS64-NEXT:    or $4, $4, $8
-; MIPS64-NEXT:    andi $4, $4, 65535
-; MIPS64-NEXT:    or $2, $4, $2
-; MIPS64-NEXT:    or $4, $11, $10
-; MIPS64-NEXT:    addu $6, $12, $1
-; MIPS64-NEXT:    sll $8, $9, 8
-; MIPS64-NEXT:    srl $9, $3, 16
-; MIPS64-NEXT:    srl $10, $5, 16
-; MIPS64-NEXT:    addu $9, $10, $9
-; MIPS64-NEXT:    andi $9, $9, 255
-; MIPS64-NEXT:    or $8, $9, $8
-; MIPS64-NEXT:    addu $9, $5, $3
-; MIPS64-NEXT:    dsll $2, $2, 32
-; MIPS64-NEXT:    sll $4, $4, 16
+; MIPS64-NEXT:    or $3, $4, $3
+; MIPS64-NEXT:    andi $3, $3, 65535
+; MIPS64-NEXT:    or $2, $3, $2
+; MIPS64-NEXT:    or $1, $13, $1
+; MIPS64-NEXT:    or $3, $11, $8
+; MIPS64-NEXT:    sll $4, $9, 8
+; MIPS64-NEXT:    sll $6, $12, 0
+; MIPS64-NEXT:    dsrl $8, $5, 32
+; MIPS64-NEXT:    sll $8, $8, 0
+; MIPS64-NEXT:    addu $6, $8, $6
 ; MIPS64-NEXT:    andi $6, $6, 255
-; MIPS64-NEXT:    srl $1, $1, 8
-; MIPS64-NEXT:    srl $10, $12, 8
-; MIPS64-NEXT:    addu $1, $10, $1
-; MIPS64-NEXT:    sll $1, $1, 8
-; MIPS64-NEXT:    or $1, $6, $1
-; MIPS64-NEXT:    andi $1, $1, 65535
-; MIPS64-NEXT:    or $1, $1, $4
+; MIPS64-NEXT:    or $4, $6, $4
+; MIPS64-NEXT:    andi $4, $4, 65535
 ; MIPS64-NEXT:    dsll $1, $1, 32
+; MIPS64-NEXT:    or $3, $4, $3
+; MIPS64-NEXT:    sll $4, $7, 0
+; MIPS64-NEXT:    srl $6, $4, 24
+; MIPS64-NEXT:    sll $5, $5, 0
+; MIPS64-NEXT:    srl $7, $5, 24
+; MIPS64-NEXT:    addu $8, $5, $4
+; MIPS64-NEXT:    dsll $2, $2, 32
 ; MIPS64-NEXT:    dsrl $1, $1, 32
+; MIPS64-NEXT:    addu $6, $7, $6
+; MIPS64-NEXT:    sll $6, $6, 24
+; MIPS64-NEXT:    srl $7, $4, 16
+; MIPS64-NEXT:    srl $9, $5, 16
+; MIPS64-NEXT:    addu $7, $9, $7
+; MIPS64-NEXT:    andi $7, $7, 255
+; MIPS64-NEXT:    sll $7, $7, 16
 ; MIPS64-NEXT:    or $2, $1, $2
-; MIPS64-NEXT:    dsll $1, $7, 32
-; MIPS64-NEXT:    sll $4, $8, 16
-; MIPS64-NEXT:    andi $6, $9, 255
-; MIPS64-NEXT:    srl $3, $3, 8
+; MIPS64-NEXT:    dsll $1, $3, 32
+; MIPS64-NEXT:    or $3, $6, $7
+; MIPS64-NEXT:    andi $6, $8, 255
+; MIPS64-NEXT:    srl $4, $4, 8
 ; MIPS64-NEXT:    srl $5, $5, 8
-; MIPS64-NEXT:    addu $3, $5, $3
-; MIPS64-NEXT:    sll $3, $3, 8
-; MIPS64-NEXT:    or $3, $6, $3
-; MIPS64-NEXT:    andi $3, $3, 65535
-; MIPS64-NEXT:    or $3, $3, $4
+; MIPS64-NEXT:    addu $4, $5, $4
+; MIPS64-NEXT:    sll $4, $4, 8
+; MIPS64-NEXT:    or $4, $6, $4
+; MIPS64-NEXT:    andi $4, $4, 65535
+; MIPS64-NEXT:    or $3, $4, $3
 ; MIPS64-NEXT:    dsll $3, $3, 32
 ; MIPS64-NEXT:    dsrl $3, $3, 32
 ; MIPS64-NEXT:    or $3, $3, $1
@@ -6617,24 +6617,24 @@ define <2 x i24> @i24x2(<2 x i24> %a, <2 x i24> %b) {
 ; MIPS64R5EB:       # %bb.0: # %Entry
 ; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sh $5, 20($sp)
 ; MIPS64R5EB-NEXT:    dsrl $1, $5, 16
 ; MIPS64R5EB-NEXT:    sw $1, 16($sp)
-; MIPS64R5EB-NEXT:    sh $5, 20($sp)
+; MIPS64R5EB-NEXT:    sh $4, 28($sp)
 ; MIPS64R5EB-NEXT:    dsrl $1, $4, 16
 ; MIPS64R5EB-NEXT:    sw $1, 24($sp)
-; MIPS64R5EB-NEXT:    sh $4, 28($sp)
-; MIPS64R5EB-NEXT:    lb $1, 19($sp)
-; MIPS64R5EB-NEXT:    dsll $1, $1, 8
-; MIPS64R5EB-NEXT:    lbu $2, 20($sp)
-; MIPS64R5EB-NEXT:    or $1, $1, $2
+; MIPS64R5EB-NEXT:    lbu $1, 20($sp)
 ; MIPS64R5EB-NEXT:    dsll $1, $1, 8
-; MIPS64R5EB-NEXT:    lb $2, 27($sp)
+; MIPS64R5EB-NEXT:    lb $2, 19($sp)
+; MIPS64R5EB-NEXT:    dsll $2, $2, 16
+; MIPS64R5EB-NEXT:    or $1, $2, $1
+; MIPS64R5EB-NEXT:    lbu $2, 28($sp)
 ; MIPS64R5EB-NEXT:    dsll $2, $2, 8
-; MIPS64R5EB-NEXT:    lbu $3, 28($sp)
-; MIPS64R5EB-NEXT:    or $2, $2, $3
-; MIPS64R5EB-NEXT:    lbu $3, 21($sp)
-; MIPS64R5EB-NEXT:    dsll $2, $2, 8
-; MIPS64R5EB-NEXT:    or $1, $3, $1
+; MIPS64R5EB-NEXT:    lb $3, 27($sp)
+; MIPS64R5EB-NEXT:    dsll $3, $3, 16
+; MIPS64R5EB-NEXT:    lbu $4, 21($sp)
+; MIPS64R5EB-NEXT:    or $2, $3, $2
+; MIPS64R5EB-NEXT:    or $1, $4, $1
 ; MIPS64R5EB-NEXT:    lh $3, 16($sp)
 ; MIPS64R5EB-NEXT:    dsll $3, $3, 8
 ; MIPS64R5EB-NEXT:    lbu $4, 18($sp)

diff  --git a/llvm/test/CodeGen/Mips/load-store-left-right.ll b/llvm/test/CodeGen/Mips/load-store-left-right.ll
index 17d3f480acf55..3eb5053de83e6 100644
--- a/llvm/test/CodeGen/Mips/load-store-left-right.ll
+++ b/llvm/test/CodeGen/Mips/load-store-left-right.ll
@@ -977,12 +977,12 @@ define void @pass_array_byval() nounwind {
 ; MIPS32-EB-NEXT:    addu $gp, $2, $25
 ; MIPS32-EB-NEXT:    lw $1, %got(arr)($gp)
 ; MIPS32-EB-NEXT:    lwl $4, 0($1)
-; MIPS32-EB-NEXT:    lwr $4, 3($1)
 ; MIPS32-EB-NEXT:    lbu $2, 5($1)
+; MIPS32-EB-NEXT:    lwr $4, 3($1)
+; MIPS32-EB-NEXT:    sll $2, $2, 16
 ; MIPS32-EB-NEXT:    lbu $3, 4($1)
-; MIPS32-EB-NEXT:    sll $3, $3, 8
+; MIPS32-EB-NEXT:    sll $3, $3, 24
 ; MIPS32-EB-NEXT:    or $2, $3, $2
-; MIPS32-EB-NEXT:    sll $2, $2, 16
 ; MIPS32-EB-NEXT:    lbu $1, 6($1)
 ; MIPS32-EB-NEXT:    sll $1, $1, 8
 ; MIPS32-EB-NEXT:    lw $25, %call16(extern_func)($gp)
@@ -1046,18 +1046,18 @@ define void @pass_array_byval() nounwind {
 ; MIPS64-EL-NEXT:    daddu $1, $1, $25
 ; MIPS64-EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
 ; MIPS64-EL-NEXT:    ld $1, %got_disp(arr)($gp)
-; MIPS64-EL-NEXT:    lwl $2, 3($1)
-; MIPS64-EL-NEXT:    lwr $2, 0($1)
-; MIPS64-EL-NEXT:    daddiu $3, $zero, 1
-; MIPS64-EL-NEXT:    dsll $3, $3, 32
-; MIPS64-EL-NEXT:    daddiu $3, $3, -1
-; MIPS64-EL-NEXT:    and $2, $2, $3
-; MIPS64-EL-NEXT:    lbu $3, 4($1)
-; MIPS64-EL-NEXT:    lbu $4, 5($1)
-; MIPS64-EL-NEXT:    dsll $4, $4, 8
-; MIPS64-EL-NEXT:    or $3, $4, $3
-; MIPS64-EL-NEXT:    dsll $3, $3, 32
-; MIPS64-EL-NEXT:    or $2, $2, $3
+; MIPS64-EL-NEXT:    lbu $2, 4($1)
+; MIPS64-EL-NEXT:    dsll $2, $2, 32
+; MIPS64-EL-NEXT:    lbu $3, 5($1)
+; MIPS64-EL-NEXT:    dsll $3, $3, 40
+; MIPS64-EL-NEXT:    or $2, $3, $2
+; MIPS64-EL-NEXT:    lwl $3, 3($1)
+; MIPS64-EL-NEXT:    lwr $3, 0($1)
+; MIPS64-EL-NEXT:    daddiu $4, $zero, 1
+; MIPS64-EL-NEXT:    dsll $4, $4, 32
+; MIPS64-EL-NEXT:    daddiu $4, $4, -1
+; MIPS64-EL-NEXT:    and $3, $3, $4
+; MIPS64-EL-NEXT:    or $2, $3, $2
 ; MIPS64-EL-NEXT:    lbu $1, 6($1)
 ; MIPS64-EL-NEXT:    dsll $1, $1, 48
 ; MIPS64-EL-NEXT:    ld $25, %call16(extern_func)($gp)
@@ -1079,15 +1079,15 @@ define void @pass_array_byval() nounwind {
 ; MIPS64-EB-NEXT:    daddu $1, $1, $25
 ; MIPS64-EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
 ; MIPS64-EB-NEXT:    ld $1, %got_disp(arr)($gp)
-; MIPS64-EB-NEXT:    lwl $2, 0($1)
-; MIPS64-EB-NEXT:    lwr $2, 3($1)
-; MIPS64-EB-NEXT:    dsll $2, $2, 32
-; MIPS64-EB-NEXT:    lbu $3, 5($1)
-; MIPS64-EB-NEXT:    lbu $4, 4($1)
-; MIPS64-EB-NEXT:    dsll $4, $4, 8
-; MIPS64-EB-NEXT:    or $3, $4, $3
-; MIPS64-EB-NEXT:    dsll $3, $3, 16
-; MIPS64-EB-NEXT:    or $2, $2, $3
+; MIPS64-EB-NEXT:    lbu $2, 5($1)
+; MIPS64-EB-NEXT:    dsll $2, $2, 16
+; MIPS64-EB-NEXT:    lbu $3, 4($1)
+; MIPS64-EB-NEXT:    dsll $3, $3, 24
+; MIPS64-EB-NEXT:    or $2, $3, $2
+; MIPS64-EB-NEXT:    lwl $3, 0($1)
+; MIPS64-EB-NEXT:    lwr $3, 3($1)
+; MIPS64-EB-NEXT:    dsll $3, $3, 32
+; MIPS64-EB-NEXT:    or $2, $3, $2
 ; MIPS64-EB-NEXT:    lbu $1, 6($1)
 ; MIPS64-EB-NEXT:    dsll $1, $1, 8
 ; MIPS64-EB-NEXT:    ld $25, %call16(extern_func)($gp)
@@ -1109,15 +1109,15 @@ define void @pass_array_byval() nounwind {
 ; MIPS64R2-EL-NEXT:    daddu $1, $1, $25
 ; MIPS64R2-EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
 ; MIPS64R2-EL-NEXT:    ld $1, %got_disp(arr)($gp)
-; MIPS64R2-EL-NEXT:    lwl $2, 3($1)
-; MIPS64R2-EL-NEXT:    lwr $2, 0($1)
-; MIPS64R2-EL-NEXT:    dext $2, $2, 0, 32
-; MIPS64R2-EL-NEXT:    lbu $3, 4($1)
-; MIPS64R2-EL-NEXT:    lbu $4, 5($1)
-; MIPS64R2-EL-NEXT:    dsll $4, $4, 8
-; MIPS64R2-EL-NEXT:    or $3, $4, $3
-; MIPS64R2-EL-NEXT:    dsll $3, $3, 32
-; MIPS64R2-EL-NEXT:    or $2, $2, $3
+; MIPS64R2-EL-NEXT:    lbu $2, 4($1)
+; MIPS64R2-EL-NEXT:    dsll $2, $2, 32
+; MIPS64R2-EL-NEXT:    lbu $3, 5($1)
+; MIPS64R2-EL-NEXT:    dsll $3, $3, 40
+; MIPS64R2-EL-NEXT:    or $2, $3, $2
+; MIPS64R2-EL-NEXT:    lwl $3, 3($1)
+; MIPS64R2-EL-NEXT:    lwr $3, 0($1)
+; MIPS64R2-EL-NEXT:    dext $3, $3, 0, 32
+; MIPS64R2-EL-NEXT:    or $2, $3, $2
 ; MIPS64R2-EL-NEXT:    lbu $1, 6($1)
 ; MIPS64R2-EL-NEXT:    dsll $1, $1, 48
 ; MIPS64R2-EL-NEXT:    ld $25, %call16(extern_func)($gp)
@@ -1140,10 +1140,10 @@ define void @pass_array_byval() nounwind {
 ; MIPS64R2-EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(pass_array_byval)))
 ; MIPS64R2-EB-NEXT:    ld $1, %got_disp(arr)($gp)
 ; MIPS64R2-EB-NEXT:    lbu $2, 5($1)
+; MIPS64R2-EB-NEXT:    dsll $2, $2, 16
 ; MIPS64R2-EB-NEXT:    lbu $3, 4($1)
-; MIPS64R2-EB-NEXT:    dsll $3, $3, 8
+; MIPS64R2-EB-NEXT:    dsll $3, $3, 24
 ; MIPS64R2-EB-NEXT:    or $2, $3, $2
-; MIPS64R2-EB-NEXT:    dsll $2, $2, 16
 ; MIPS64R2-EB-NEXT:    lwl $3, 0($1)
 ; MIPS64R2-EB-NEXT:    lwr $3, 3($1)
 ; MIPS64R2-EB-NEXT:    dext $3, $3, 0, 32

diff  --git a/llvm/test/CodeGen/Mips/unalignedload.ll b/llvm/test/CodeGen/Mips/unalignedload.ll
index a6f0a8ae0e63c..aad585fcf6eeb 100644
--- a/llvm/test/CodeGen/Mips/unalignedload.ll
+++ b/llvm/test/CodeGen/Mips/unalignedload.ll
@@ -43,14 +43,14 @@ define void @bar1() nounwind {
 ; MIPS32-EB-NEXT:    addu $gp, $2, $25
 ; MIPS32-EB-NEXT:    lw $1, %got(s2)($gp)
 ; MIPS32-EB-NEXT:    lbu $2, 3($1)
+; MIPS32-EB-NEXT:    sll $2, $2, 16
 ; MIPS32-EB-NEXT:    lbu $1, 2($1)
-; MIPS32-EB-NEXT:    sll $1, $1, 8
-; MIPS32-EB-NEXT:    or $1, $1, $2
+; MIPS32-EB-NEXT:    sll $1, $1, 24
 ; MIPS32-EB-NEXT:    lw $25, %call16(foo2)($gp)
 ; MIPS32-EB-NEXT:    .reloc ($tmp0), R_MIPS_JALR, foo2
 ; MIPS32-EB-NEXT:  $tmp0:
 ; MIPS32-EB-NEXT:    jalr $25
-; MIPS32-EB-NEXT:    sll $4, $1, 16
+; MIPS32-EB-NEXT:    or $4, $1, $2
 ; MIPS32-EB-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
 ; MIPS32-EB-NEXT:    jr $ra
 ; MIPS32-EB-NEXT:    addiu $sp, $sp, 24
@@ -130,12 +130,12 @@ define void @bar2() nounwind {
 ; MIPS32-EB-NEXT:    addu $gp, $2, $25
 ; MIPS32-EB-NEXT:    lw $1, %got(s4)($gp)
 ; MIPS32-EB-NEXT:    lwl $4, 0($1)
-; MIPS32-EB-NEXT:    lwr $4, 3($1)
 ; MIPS32-EB-NEXT:    lbu $2, 5($1)
+; MIPS32-EB-NEXT:    lwr $4, 3($1)
+; MIPS32-EB-NEXT:    sll $2, $2, 16
 ; MIPS32-EB-NEXT:    lbu $3, 4($1)
-; MIPS32-EB-NEXT:    sll $3, $3, 8
+; MIPS32-EB-NEXT:    sll $3, $3, 24
 ; MIPS32-EB-NEXT:    or $2, $3, $2
-; MIPS32-EB-NEXT:    sll $2, $2, 16
 ; MIPS32-EB-NEXT:    lbu $1, 6($1)
 ; MIPS32-EB-NEXT:    sll $1, $1, 8
 ; MIPS32-EB-NEXT:    lw $25, %call16(foo4)($gp)

diff  --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 0c9597a7f8635..bcf367a2b06cc 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -1499,19 +1499,17 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 define i32 @pr55484(i32 %0) {
 ; RV32I-LABEL: pr55484:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: pr55484:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    slli a1, a0, 40
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    ret
 ;
@@ -1533,19 +1531,17 @@ define i32 @pr55484(i32 %0) {
 ;
 ; RV32ZBKB-LABEL: pr55484:
 ; RV32ZBKB:       # %bb.0:
-; RV32ZBKB-NEXT:    srli a1, a0, 8
-; RV32ZBKB-NEXT:    slli a0, a0, 8
-; RV32ZBKB-NEXT:    or a0, a1, a0
-; RV32ZBKB-NEXT:    slli a0, a0, 16
+; RV32ZBKB-NEXT:    slli a1, a0, 8
+; RV32ZBKB-NEXT:    slli a0, a0, 24
+; RV32ZBKB-NEXT:    or a0, a0, a1
 ; RV32ZBKB-NEXT:    srai a0, a0, 16
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: pr55484:
 ; RV64ZBKB:       # %bb.0:
-; RV64ZBKB-NEXT:    srli a1, a0, 8
-; RV64ZBKB-NEXT:    slli a0, a0, 8
-; RV64ZBKB-NEXT:    or a0, a1, a0
-; RV64ZBKB-NEXT:    slli a0, a0, 48
+; RV64ZBKB-NEXT:    slli a1, a0, 40
+; RV64ZBKB-NEXT:    slli a0, a0, 56
+; RV64ZBKB-NEXT:    or a0, a0, a1
 ; RV64ZBKB-NEXT:    srai a0, a0, 48
 ; RV64ZBKB-NEXT:    ret
   %2 = lshr i32 %0, 8

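The pr55484 hunk above is the smallest illustration of the new codegen shape: instead of combining the two operands with an OR and then applying the outer shift, each operand is shifted straight to its final position. As a minimal standalone sanity check of the underlying bit identity (this snippet is illustrative only and is not part of the patch or of the LLVM tree; the function names are made up), the following C program verifies that shifting after the OR and distributing the shift over the OR agree for in-range shift amounts:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Combined-then-shifted form: the OR happens first, then the outer shift. */
static uint32_t nested(uint32_t x, uint32_t y, unsigned c1, unsigned c2) {
  return ((x << c1) | y) << c2;
}

/* Distributed form: each operand takes a single shift to its final lane. */
static uint32_t flattened(uint32_t x, uint32_t y, unsigned c1, unsigned c2) {
  return (x << (c1 + c2)) | (y << c2);
}

int main(void) {
  const uint32_t samples[] = {0u, 1u, 0x7fu, 0x80u, 0xabcdu, 0xdeadbeefu};
  const unsigned n = sizeof(samples) / sizeof(samples[0]);
  /* Only shift amounts with c1 + c2 < 32 are checked, so no shift is UB. */
  for (unsigned c1 = 0; c1 < 32; ++c1)
    for (unsigned c2 = 0; c1 + c2 < 32; ++c2)
      for (unsigned i = 0; i < n; ++i)
        for (unsigned j = 0; j < n; ++j)
          assert(nested(samples[i], samples[j], c1, c2) ==
                 flattened(samples[i], samples[j], c1, c2));
  puts("shift-through-or identity holds for the sampled values");
  return 0;
}

The same reasoning applies when the bitwise operation is AND or XOR, since a left shift distributes over all of them modulo 2^32.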
diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 3d3811520ad02..ad311b52015ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -419,13 +419,13 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
 ; RV32-NEXT:  # %bb.1: # %cond.load
 ; RV32-NEXT:    lbu a3, 1(a0)
 ; RV32-NEXT:    lbu a4, 0(a0)
-; RV32-NEXT:    lbu a5, 3(a0)
-; RV32-NEXT:    lbu a6, 2(a0)
+; RV32-NEXT:    lbu a5, 2(a0)
+; RV32-NEXT:    lbu a6, 3(a0)
 ; RV32-NEXT:    slli a3, a3, 8
 ; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    slli a4, a5, 8
-; RV32-NEXT:    or a4, a4, a6
-; RV32-NEXT:    slli a4, a4, 16
+; RV32-NEXT:    slli a4, a5, 16
+; RV32-NEXT:    slli a5, a6, 24
+; RV32-NEXT:    or a4, a5, a4
 ; RV32-NEXT:    or a3, a4, a3
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
@@ -440,13 +440,13 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
 ; RV32-NEXT:  .LBB8_3: # %cond.load1
 ; RV32-NEXT:    lbu a2, 5(a0)
 ; RV32-NEXT:    lbu a3, 4(a0)
-; RV32-NEXT:    lbu a4, 7(a0)
-; RV32-NEXT:    lbu a0, 6(a0)
+; RV32-NEXT:    lbu a4, 6(a0)
+; RV32-NEXT:    lbu a0, 7(a0)
 ; RV32-NEXT:    slli a2, a2, 8
 ; RV32-NEXT:    or a2, a2, a3
-; RV32-NEXT:    slli a3, a4, 8
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    slli a3, a4, 16
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    or a0, a0, a3
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    vmv.s.x v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
@@ -467,13 +467,13 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
 ; RV64-NEXT:  # %bb.1: # %cond.load
 ; RV64-NEXT:    lbu a3, 1(a0)
 ; RV64-NEXT:    lbu a4, 0(a0)
-; RV64-NEXT:    lb a5, 3(a0)
-; RV64-NEXT:    lbu a6, 2(a0)
+; RV64-NEXT:    lbu a5, 2(a0)
+; RV64-NEXT:    lb a6, 3(a0)
 ; RV64-NEXT:    slli a3, a3, 8
 ; RV64-NEXT:    or a3, a3, a4
-; RV64-NEXT:    slli a4, a5, 8
-; RV64-NEXT:    or a4, a4, a6
-; RV64-NEXT:    slli a4, a4, 16
+; RV64-NEXT:    slli a4, a5, 16
+; RV64-NEXT:    slli a5, a6, 24
+; RV64-NEXT:    or a4, a5, a4
 ; RV64-NEXT:    or a3, a4, a3
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a3
@@ -488,13 +488,13 @@ define void @masked_load_v2i32_align1(<2 x i32>* %a, <2 x i32> %m, <2 x i32>* %r
 ; RV64-NEXT:  .LBB8_3: # %cond.load1
 ; RV64-NEXT:    lbu a2, 5(a0)
 ; RV64-NEXT:    lbu a3, 4(a0)
-; RV64-NEXT:    lb a4, 7(a0)
-; RV64-NEXT:    lbu a0, 6(a0)
+; RV64-NEXT:    lbu a4, 6(a0)
+; RV64-NEXT:    lb a0, 7(a0)
 ; RV64-NEXT:    slli a2, a2, 8
 ; RV64-NEXT:    or a2, a2, a3
-; RV64-NEXT:    slli a3, a4, 8
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    slli a3, a4, 16
+; RV64-NEXT:    slli a0, a0, 24
+; RV64-NEXT:    or a0, a0, a3
 ; RV64-NEXT:    or a0, a0, a2
 ; RV64-NEXT:    vmv.s.x v9, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma

diff  --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index c7d845867497b..819b5602ea0f4 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -392,14 +392,13 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64-NEXT:    lb a0, 12(a0)
 ; RV64-NEXT:    lwu a1, 8(s0)
 ; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ld a2, 0(s0)
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    slli a0, a0, 29
 ; RV64-NEXT:    srai s1, a0, 31
-; RV64-NEXT:    slli a0, a1, 31
-; RV64-NEXT:    srli a1, a2, 33
+; RV64-NEXT:    srli a0, a2, 2
+; RV64-NEXT:    slli a1, a1, 62
 ; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    slli a0, a0, 31
 ; RV64-NEXT:    srai a0, a0, 31
 ; RV64-NEXT:    slli a1, a2, 31
 ; RV64-NEXT:    srai s2, a1, 31
@@ -428,14 +427,14 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    slli a3, a1, 29
-; RV64-NEXT:    srli a3, a3, 61
-; RV64-NEXT:    sb a3, 12(s0)
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    slli a3, a2, 31
-; RV64-NEXT:    srli a3, a3, 62
-; RV64-NEXT:    or a1, a3, a1
-; RV64-NEXT:    sw a1, 8(s0)
+; RV64-NEXT:    slli a3, a1, 2
+; RV64-NEXT:    slli a4, a2, 31
+; RV64-NEXT:    srli a4, a4, 62
+; RV64-NEXT:    or a3, a4, a3
+; RV64-NEXT:    sw a3, 8(s0)
+; RV64-NEXT:    slli a1, a1, 29
+; RV64-NEXT:    srli a1, a1, 61
+; RV64-NEXT:    sb a1, 12(s0)
 ; RV64-NEXT:    slli a0, a0, 31
 ; RV64-NEXT:    srli a0, a0, 31
 ; RV64-NEXT:    slli a1, a2, 33
@@ -533,68 +532,67 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ;
 ; RV64M-LABEL: test_srem_vec:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    lb a1, 12(a0)
+; RV64M-NEXT:    ld a1, 0(a0)
 ; RV64M-NEXT:    lwu a2, 8(a0)
-; RV64M-NEXT:    slli a1, a1, 32
-; RV64M-NEXT:    or a1, a2, a1
-; RV64M-NEXT:    ld a3, 0(a0)
-; RV64M-NEXT:    slli a1, a1, 29
-; RV64M-NEXT:    srai a1, a1, 31
-; RV64M-NEXT:    slli a2, a2, 31
-; RV64M-NEXT:    srli a4, a3, 33
+; RV64M-NEXT:    srli a3, a1, 2
+; RV64M-NEXT:    lb a4, 12(a0)
+; RV64M-NEXT:    slli a5, a2, 62
+; RV64M-NEXT:    or a3, a5, a3
+; RV64M-NEXT:    srai a3, a3, 31
+; RV64M-NEXT:    slli a4, a4, 32
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_0)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_0)(a5)
-; RV64M-NEXT:    or a2, a4, a2
-; RV64M-NEXT:    slli a2, a2, 31
+; RV64M-NEXT:    or a2, a2, a4
+; RV64M-NEXT:    slli a2, a2, 29
 ; RV64M-NEXT:    srai a2, a2, 31
 ; RV64M-NEXT:    mulh a4, a2, a5
 ; RV64M-NEXT:    srli a5, a4, 63
 ; RV64M-NEXT:    srai a4, a4, 1
 ; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    slli a5, a4, 3
-; RV64M-NEXT:    sub a4, a4, a5
+; RV64M-NEXT:    slli a5, a4, 2
+; RV64M-NEXT:    add a4, a5, a4
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT:    slli a3, a3, 31
-; RV64M-NEXT:    srai a3, a3, 31
+; RV64M-NEXT:    slli a1, a1, 31
+; RV64M-NEXT:    srai a1, a1, 31
 ; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    mulh a4, a1, a5
+; RV64M-NEXT:    mulh a4, a3, a5
 ; RV64M-NEXT:    srli a5, a4, 63
 ; RV64M-NEXT:    srai a4, a4, 1
 ; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    slli a5, a4, 2
-; RV64M-NEXT:    add a4, a5, a4
-; RV64M-NEXT:    add a1, a1, a4
-; RV64M-NEXT:    addi a1, a1, -2
-; RV64M-NEXT:    seqz a1, a1
+; RV64M-NEXT:    slli a5, a4, 3
+; RV64M-NEXT:    sub a4, a4, a5
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    addi a3, a3, -1
+; RV64M-NEXT:    seqz a3, a3
 ; RV64M-NEXT:    lui a4, %hi(.LCPI3_2)
 ; RV64M-NEXT:    ld a4, %lo(.LCPI3_2)(a4)
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_3)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_3)(a5)
-; RV64M-NEXT:    addi a2, a2, -1
+; RV64M-NEXT:    addi a2, a2, -2
 ; RV64M-NEXT:    seqz a2, a2
-; RV64M-NEXT:    mul a3, a3, a4
-; RV64M-NEXT:    add a3, a3, a5
-; RV64M-NEXT:    slli a4, a3, 63
-; RV64M-NEXT:    srli a3, a3, 1
-; RV64M-NEXT:    or a3, a3, a4
-; RV64M-NEXT:    sltu a3, a5, a3
+; RV64M-NEXT:    mul a1, a1, a4
+; RV64M-NEXT:    add a1, a1, a5
+; RV64M-NEXT:    slli a4, a1, 63
+; RV64M-NEXT:    srli a1, a1, 1
+; RV64M-NEXT:    or a1, a1, a4
+; RV64M-NEXT:    sltu a1, a5, a1
 ; RV64M-NEXT:    addi a2, a2, -1
-; RV64M-NEXT:    addi a1, a1, -1
-; RV64M-NEXT:    neg a3, a3
-; RV64M-NEXT:    slli a4, a1, 29
-; RV64M-NEXT:    srli a4, a4, 61
-; RV64M-NEXT:    sb a4, 12(a0)
-; RV64M-NEXT:    slli a4, a2, 33
+; RV64M-NEXT:    addi a3, a3, -1
+; RV64M-NEXT:    neg a1, a1
+; RV64M-NEXT:    slli a4, a3, 33
+; RV64M-NEXT:    slli a1, a1, 31
+; RV64M-NEXT:    srli a1, a1, 31
+; RV64M-NEXT:    or a1, a1, a4
+; RV64M-NEXT:    sd a1, 0(a0)
+; RV64M-NEXT:    slli a1, a2, 2
 ; RV64M-NEXT:    slli a3, a3, 31
-; RV64M-NEXT:    srli a3, a3, 31
-; RV64M-NEXT:    or a3, a3, a4
-; RV64M-NEXT:    sd a3, 0(a0)
-; RV64M-NEXT:    slli a1, a1, 2
-; RV64M-NEXT:    slli a2, a2, 31
-; RV64M-NEXT:    srli a2, a2, 62
-; RV64M-NEXT:    or a1, a2, a1
+; RV64M-NEXT:    srli a3, a3, 62
+; RV64M-NEXT:    or a1, a3, a1
 ; RV64M-NEXT:    sw a1, 8(a0)
+; RV64M-NEXT:    slli a1, a2, 29
+; RV64M-NEXT:    srli a1, a1, 61
+; RV64M-NEXT:    sb a1, 12(a0)
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_srem_vec:
@@ -714,49 +712,48 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64MV-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; RV64MV-NEXT:    addi s0, sp, 64
 ; RV64MV-NEXT:    andi sp, sp, -32
-; RV64MV-NEXT:    lwu a1, 8(a0)
-; RV64MV-NEXT:    ld a2, 0(a0)
-; RV64MV-NEXT:    slli a3, a1, 31
-; RV64MV-NEXT:    srli a4, a2, 33
-; RV64MV-NEXT:    lb a5, 12(a0)
-; RV64MV-NEXT:    or a3, a4, a3
+; RV64MV-NEXT:    lb a1, 12(a0)
+; RV64MV-NEXT:    lwu a2, 8(a0)
+; RV64MV-NEXT:    slli a1, a1, 32
+; RV64MV-NEXT:    ld a3, 0(a0)
+; RV64MV-NEXT:    or a1, a2, a1
+; RV64MV-NEXT:    slli a1, a1, 29
+; RV64MV-NEXT:    srai a1, a1, 31
+; RV64MV-NEXT:    srli a4, a3, 2
+; RV64MV-NEXT:    slli a2, a2, 62
+; RV64MV-NEXT:    lui a5, %hi(.LCPI3_0)
+; RV64MV-NEXT:    ld a5, %lo(.LCPI3_0)(a5)
+; RV64MV-NEXT:    or a2, a2, a4
 ; RV64MV-NEXT:    slli a3, a3, 31
 ; RV64MV-NEXT:    srai a3, a3, 31
-; RV64MV-NEXT:    slli a4, a5, 32
-; RV64MV-NEXT:    or a1, a1, a4
-; RV64MV-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64MV-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
-; RV64MV-NEXT:    slli a1, a1, 29
-; RV64MV-NEXT:    slli a2, a2, 31
-; RV64MV-NEXT:    srai a2, a2, 31
-; RV64MV-NEXT:    mulh a4, a2, a4
+; RV64MV-NEXT:    mulh a4, a3, a5
 ; RV64MV-NEXT:    srli a5, a4, 63
 ; RV64MV-NEXT:    add a4, a4, a5
 ; RV64MV-NEXT:    li a5, 6
 ; RV64MV-NEXT:    mul a4, a4, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64MV-NEXT:    srai a1, a1, 31
-; RV64MV-NEXT:    sub a2, a2, a4
-; RV64MV-NEXT:    sd a2, 0(sp)
-; RV64MV-NEXT:    mulh a2, a1, a5
-; RV64MV-NEXT:    srli a4, a2, 63
-; RV64MV-NEXT:    srai a2, a2, 1
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    slli a4, a2, 2
+; RV64MV-NEXT:    srai a2, a2, 31
+; RV64MV-NEXT:    sub a3, a3, a4
+; RV64MV-NEXT:    sd a3, 0(sp)
+; RV64MV-NEXT:    mulh a3, a2, a5
+; RV64MV-NEXT:    srli a4, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a4
+; RV64MV-NEXT:    slli a4, a3, 3
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    add a2, a4, a2
+; RV64MV-NEXT:    sub a3, a3, a4
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    sd a2, 8(sp)
+; RV64MV-NEXT:    mulh a2, a1, a5
+; RV64MV-NEXT:    srli a3, a2, 63
+; RV64MV-NEXT:    srai a2, a2, 1
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    slli a3, a2, 2
+; RV64MV-NEXT:    add a2, a3, a2
 ; RV64MV-NEXT:    add a1, a1, a2
 ; RV64MV-NEXT:    sd a1, 16(sp)
-; RV64MV-NEXT:    mulh a1, a3, a5
-; RV64MV-NEXT:    srli a2, a1, 63
-; RV64MV-NEXT:    srai a1, a1, 1
-; RV64MV-NEXT:    add a1, a1, a2
-; RV64MV-NEXT:    slli a2, a1, 3
-; RV64MV-NEXT:    sub a1, a1, a2
-; RV64MV-NEXT:    add a1, a3, a1
-; RV64MV-NEXT:    sd a1, 8(sp)
 ; RV64MV-NEXT:    mv a1, sp
 ; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64MV-NEXT:    vle64.v v8, (a1)

diff  --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index c5bb7e4b5be01..cf657bea4187c 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -64,13 +64,13 @@ define i32 @load_i32(i32* %p) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    lbu a2, 0(a0)
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a3, 8
-; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a2, a3, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -78,13 +78,13 @@ define i32 @load_i32(i32* %p) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    lbu a2, 0(a0)
-; RV64I-NEXT:    lb a3, 3(a0)
-; RV64I-NEXT:    lbu a0, 2(a0)
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lb a0, 3(a0)
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a2, a3, 8
-; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli a2, a3, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -101,23 +101,23 @@ define i64 @load_i64(i64* %p) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    lbu a2, 0(a0)
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    lbu a4, 3(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a3, 8
-; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    slli a2, a2, 16
+; RV32I-NEXT:    slli a2, a3, 16
+; RV32I-NEXT:    slli a3, a4, 24
+; RV32I-NEXT:    or a2, a3, a2
 ; RV32I-NEXT:    or a2, a2, a1
 ; RV32I-NEXT:    lbu a1, 5(a0)
 ; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a0, 6(a0)
+; RV32I-NEXT:    lbu a4, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a4, 8
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a3, a4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -126,23 +126,23 @@ define i64 @load_i64(i64* %p) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    lbu a2, 0(a0)
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    lbu a4, 2(a0)
+; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lbu a4, 3(a0)
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a2, a3, 8
-; RV64I-NEXT:    or a2, a2, a4
-; RV64I-NEXT:    slli a2, a2, 16
+; RV64I-NEXT:    slli a2, a3, 16
+; RV64I-NEXT:    slli a3, a4, 24
+; RV64I-NEXT:    or a2, a3, a2
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    lbu a2, 5(a0)
 ; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    lbu a4, 7(a0)
-; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    lbu a4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a3, a4, 8
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli a3, a4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1

diff  --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index ee337a95754a8..2687857093414 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -73,46 +73,41 @@ define i16 @fun1(<16 x i1> %src)
 define void @fun2(<8 x i32> %src, ptr %p)
 ; CHECK-LABEL: fun2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
-; CHECK-NEXT:    .cfi_offset %r14, -48
-; CHECK-NEXT:    .cfi_offset %r15, -40
 ; CHECK-NEXT:    vlgvf %r1, %v26, 3
-; CHECK-NEXT:    vlgvf %r0, %v26, 2
+; CHECK-NEXT:    vlgvf %r5, %v24, 0
+; CHECK-NEXT:    vlgvf %r3, %v24, 1
+; CHECK-NEXT:    srlk %r0, %r1, 8
+; CHECK-NEXT:    sllg %r5, %r5, 33
+; CHECK-NEXT:    sth %r0, 28(%r2)
+; CHECK-NEXT:    rosbg %r5, %r3, 31, 55, 2
+; CHECK-NEXT:    vlgvf %r0, %v24, 2
+; CHECK-NEXT:    sllg %r4, %r3, 58
+; CHECK-NEXT:    vlgvf %r3, %v26, 2
 ; CHECK-NEXT:    stc %r1, 30(%r2)
-; CHECK-NEXT:    srlk %r3, %r1, 8
+; CHECK-NEXT:    rosbg %r4, %r0, 6, 36, 27
 ; CHECK-NEXT:    risbgn %r1, %r1, 33, 167, 0
-; CHECK-NEXT:    vlgvf %r5, %v24, 2
-; CHECK-NEXT:    rosbg %r1, %r0, 2, 32, 31
-; CHECK-NEXT:    sth %r3, 28(%r2)
+; CHECK-NEXT:    rosbg %r1, %r3, 2, 32, 31
 ; CHECK-NEXT:    srlg %r1, %r1, 24
-; CHECK-NEXT:    vlgvf %r3, %v24, 3
+; CHECK-NEXT:    rosbg %r5, %r4, 56, 63, 8
+; CHECK-NEXT:    vlgvf %r4, %v24, 3
 ; CHECK-NEXT:    st %r1, 24(%r2)
 ; CHECK-NEXT:    vlgvf %r1, %v26, 0
-; CHECK-NEXT:    risbgn %r14, %r5, 6, 164, 27
-; CHECK-NEXT:    sllg %r4, %r3, 60
-; CHECK-NEXT:    rosbg %r14, %r3, 37, 63, 60
-; CHECK-NEXT:    sllg %r3, %r14, 8
-; CHECK-NEXT:    rosbg %r4, %r1, 4, 34, 29
-; CHECK-NEXT:    rosbg %r3, %r4, 56, 63, 8
-; CHECK-NEXT:    stg %r3, 8(%r2)
-; CHECK-NEXT:    vlgvf %r3, %v24, 1
-; CHECK-NEXT:    sllg %r4, %r3, 58
-; CHECK-NEXT:    rosbg %r4, %r5, 6, 36, 27
-; CHECK-NEXT:    vlgvf %r5, %v24, 0
-; CHECK-NEXT:    sllg %r5, %r5, 25
-; CHECK-NEXT:    rosbg %r5, %r3, 39, 63, 58
-; CHECK-NEXT:    sllg %r3, %r5, 8
-; CHECK-NEXT:    rosbg %r3, %r4, 56, 63, 8
-; CHECK-NEXT:    stg %r3, 0(%r2)
-; CHECK-NEXT:    vlgvf %r3, %v26, 1
-; CHECK-NEXT:    sllg %r4, %r3, 62
-; CHECK-NEXT:    rosbg %r4, %r0, 2, 32, 31
-; CHECK-NEXT:    risbgn %r0, %r1, 4, 162, 29
-; CHECK-NEXT:    rosbg %r0, %r3, 35, 63, 62
+; CHECK-NEXT:    risbgn %r0, %r0, 6, 164, 27
+; CHECK-NEXT:    rosbg %r0, %r4, 37, 63, 60
+; CHECK-NEXT:    stg %r5, 0(%r2)
+; CHECK-NEXT:    sllg %r5, %r4, 60
 ; CHECK-NEXT:    sllg %r0, %r0, 8
+; CHECK-NEXT:    rosbg %r5, %r1, 4, 34, 29
+; CHECK-NEXT:    risbgn %r1, %r1, 4, 162, 29
+; CHECK-NEXT:    rosbg %r0, %r5, 56, 63, 8
+; CHECK-NEXT:    stg %r0, 8(%r2)
+; CHECK-NEXT:    vlgvf %r0, %v26, 1
+; CHECK-NEXT:    sllg %r4, %r0, 62
+; CHECK-NEXT:    rosbg %r1, %r0, 35, 63, 62
+; CHECK-NEXT:    sllg %r0, %r1, 8
+; CHECK-NEXT:    rosbg %r4, %r3, 2, 32, 31
 ; CHECK-NEXT:    rosbg %r0, %r4, 56, 63, 8
 ; CHECK-NEXT:    stg %r0, 16(%r2)
-; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    br %r14
 {
   %tmp = trunc <8 x i32> %src to <8 x i31>

diff  --git a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
index d2470f546a5aa..aa5deb6542b2b 100644
--- a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
@@ -31,12 +31,12 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r1, .LCPI1_0
 ; CHECK-NEXT:    muls r1, r0, r1
-; CHECK-NEXT:    lsls r0, r1, #26
+; CHECK-NEXT:    lsls r0, r1, #31
 ; CHECK-NEXT:    ldr r2, .LCPI1_1
 ; CHECK-NEXT:    ands r2, r1
 ; CHECK-NEXT:    lsrs r1, r2, #1
-; CHECK-NEXT:    adds r0, r1, r0
-; CHECK-NEXT:    lsls r0, r0, #5
+; CHECK-NEXT:    lsls r1, r1, #5
+; CHECK-NEXT:    adds r0, r0, r1
 ; CHECK-NEXT:    ldr r1, .LCPI1_2
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo .LBB1_2

diff  --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll
index 2cc7fa6ba864f..e4deb878aa461 100644
--- a/llvm/test/CodeGen/X86/bool-vector.ll
+++ b/llvm/test/CodeGen/X86/bool-vector.ll
@@ -13,10 +13,10 @@ define i32 @PR15215_bad(<4 x i32> %input) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    addb %ah, %ah
+; X86-NEXT:    shlb $3, %ah
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    orb %ah, %cl
 ; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    orb %ah, %cl
 ; X86-NEXT:    addb %dl, %dl
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    orb %dl, %al
@@ -28,10 +28,10 @@ define i32 @PR15215_bad(<4 x i32> %input) {
 ;
 ; X64-LABEL: PR15215_bad:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    shlb $3, %cl
 ; X64-NEXT:    andb $1, %dl
-; X64-NEXT:    orb %cl, %dl
 ; X64-NEXT:    shlb $2, %dl
+; X64-NEXT:    orb %cl, %dl
 ; X64-NEXT:    addb %sil, %sil
 ; X64-NEXT:    andb $1, %dil
 ; X64-NEXT:    orb %sil, %dil

diff  --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 2f6576f29d0ac..12d62bf59ae57 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -233,13 +233,13 @@ define i32 @test_bitreverse_shli_bitreverse(i32 %a0) nounwind {
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %eax
 ; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $5592405, %ecx # imm = 0x555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $22369621, %eax # imm = 0x1555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    shll $7, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $5592405, %eax # imm = 0x555555
+; X86-NEXT:    shll $6, %ecx
+; X86-NEXT:    andl $-1431655808, %ecx # imm = 0xAAAAAA80
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $986895, %ecx # imm = 0xF0F0F
@@ -276,22 +276,22 @@ define i32 @test_bitreverse_shli_bitreverse(i32 %a0) nounwind {
 ; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $5592405, %ecx # imm = 0x555555
-; X64-NEXT:    shrl %eax
-; X64-NEXT:    andl $22369621, %eax # imm = 0x1555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
-; X64-NEXT:    shll $7, %eax
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $986895, %ecx # imm = 0xF0F0F
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    andl $135204623, %eax # imm = 0x80F0F0F
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    andl $3355443, %ecx # imm = 0x333333
-; X64-NEXT:    shrl $2, %eax
-; X64-NEXT:    andl $36909875, %eax # imm = 0x2333333
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    shll $6, %eax
+; X64-NEXT:    andl $-1431655808, %eax # imm = 0xAAAAAA80
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $986895, %eax # imm = 0xF0F0F
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shrl $4, %ecx
+; X64-NEXT:    andl $135204623, %ecx # imm = 0x80F0F0F
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    andl $3355443, %eax # imm = 0x333333
+; X64-NEXT:    shrl $2, %ecx
+; X64-NEXT:    andl $36909875, %ecx # imm = 0x2333333
+; X64-NEXT:    leal (%rcx,%rax,4), %eax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X64-NEXT:    shrl %eax
@@ -322,10 +322,8 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $357913941, %ecx # imm = 0x15555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $235867919, %ecx # imm = 0xE0F0F0F

diff  --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 6ef60023eac7b..bc880ae7329ee 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -857,12 +857,13 @@ define <4 x i1> @isnan_v4f(<4 x float> %x) {
 ; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; CHECK-32-NEXT:    sahf
 ; CHECK-32-NEXT:    setp %dh
+; CHECK-32-NEXT:    shlb $2, %dh
 ; CHECK-32-NEXT:    fucomp %st(0)
 ; CHECK-32-NEXT:    fnstsw %ax
 ; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; CHECK-32-NEXT:    sahf
 ; CHECK-32-NEXT:    setp %dl
-; CHECK-32-NEXT:    addb %dl, %dl
+; CHECK-32-NEXT:    shlb $3, %dl
 ; CHECK-32-NEXT:    orb %dh, %dl
 ; CHECK-32-NEXT:    fucomp %st(0)
 ; CHECK-32-NEXT:    fnstsw %ax
@@ -876,7 +877,6 @@ define <4 x i1> @isnan_v4f(<4 x float> %x) {
 ; CHECK-32-NEXT:    setp %al
 ; CHECK-32-NEXT:    addb %al, %al
 ; CHECK-32-NEXT:    orb %dh, %al
-; CHECK-32-NEXT:    shlb $2, %al
 ; CHECK-32-NEXT:    orb %dl, %al
 ; CHECK-32-NEXT:    movb %al, (%ecx)
 ; CHECK-32-NEXT:    movl %ecx, %eax
@@ -903,11 +903,12 @@ define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
 ; CHECK-32-NEXT:    andl %ecx, %edx
 ; CHECK-32-NEXT:    cmpl $2139095041, %edx # imm = 0x7F800001
 ; CHECK-32-NEXT:    setge %dh
+; CHECK-32-NEXT:    shlb $2, %dh
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-32-NEXT:    andl %ecx, %esi
 ; CHECK-32-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
 ; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    addb %dl, %dl
+; CHECK-32-NEXT:    shlb $3, %dl
 ; CHECK-32-NEXT:    orb %dh, %dl
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-32-NEXT:    andl %ecx, %esi
@@ -918,7 +919,6 @@ define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
 ; CHECK-32-NEXT:    setge %cl
 ; CHECK-32-NEXT:    addb %cl, %cl
 ; CHECK-32-NEXT:    orb %dh, %cl
-; CHECK-32-NEXT:    shlb $2, %cl
 ; CHECK-32-NEXT:    orb %dl, %cl
 ; CHECK-32-NEXT:    movb %cl, (%eax)
 ; CHECK-32-NEXT:    popl %esi

diff  --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 61980ce083c2b..7374a75992776 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -3519,11 +3519,11 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
 ; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    movl 8(%rdi), %ecx
-; SSE2-NEXT:    shll $13, %ecx
+; SSE2-NEXT:    shll $28, %ecx
 ; SSE2-NEXT:    movq %rax, %rdx
 ; SSE2-NEXT:    shrq $51, %rdx
-; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    shll $15, %edx
+; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    sarl $15, %edx
 ; SSE2-NEXT:    movd %edx, %xmm1
 ; SSE2-NEXT:    shrq $34, %rax
@@ -3548,11 +3548,11 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
 ; SSSE3-NEXT:    movd %ecx, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    movl 8(%rdi), %ecx
-; SSSE3-NEXT:    shll $13, %ecx
+; SSSE3-NEXT:    shll $28, %ecx
 ; SSSE3-NEXT:    movq %rax, %rdx
 ; SSSE3-NEXT:    shrq $51, %rdx
-; SSSE3-NEXT:    orl %ecx, %edx
 ; SSSE3-NEXT:    shll $15, %edx
+; SSSE3-NEXT:    orl %ecx, %edx
 ; SSSE3-NEXT:    sarl $15, %edx
 ; SSSE3-NEXT:    movd %edx, %xmm1
 ; SSSE3-NEXT:    shrq $34, %rax
@@ -3581,10 +3581,10 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
 ; SSE41-NEXT:    sarl $15, %ecx
 ; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
 ; SSE41-NEXT:    movl 8(%rdi), %ecx
-; SSE41-NEXT:    shll $13, %ecx
+; SSE41-NEXT:    shll $28, %ecx
 ; SSE41-NEXT:    shrq $51, %rax
-; SSE41-NEXT:    orl %ecx, %eax
 ; SSE41-NEXT:    shll $15, %eax
+; SSE41-NEXT:    orl %ecx, %eax
 ; SSE41-NEXT:    sarl $15, %eax
 ; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
 ; SSE41-NEXT:    retq
@@ -3607,10 +3607,10 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
 ; AVX-NEXT:    sarl $15, %ecx
 ; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
 ; AVX-NEXT:    movl 8(%rdi), %ecx
-; AVX-NEXT:    shll $13, %ecx
+; AVX-NEXT:    shll $28, %ecx
 ; AVX-NEXT:    shrq $51, %rax
-; AVX-NEXT:    orl %ecx, %eax
 ; AVX-NEXT:    shll $15, %eax
+; AVX-NEXT:    orl %ecx, %eax
 ; AVX-NEXT:    sarl $15, %eax
 ; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    retq


        

