[llvm] 69d5a03 - [DAG] Enable ISD::SRL SimplifyMultipleUseDemandedBits handling inside SimplifyDemandedBits

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 28 06:11:08 PDT 2022


Author: Simon Pilgrim
Date: 2022-07-28T14:10:44+01:00
New Revision: 69d5a038b90d3616a5c22d4e0e682aad4679758d

URL: https://github.com/llvm/llvm-project/commit/69d5a038b90d3616a5c22d4e0e682aad4679758d
DIFF: https://github.com/llvm/llvm-project/commit/69d5a038b90d3616a5c22d4e0e682aad4679758d.diff

LOG: [DAG] Enable ISD::SRL SimplifyMultipleUseDemandedBits handling inside SimplifyDemandedBits

This patch allows SimplifyDemandedBits to call SimplifyMultipleUseDemandedBits in cases where the ISD::SRL source operand has other uses, enabling us to peek through the shifted value if we don't demand all the bits/elts.
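
For example (a hedged sketch; the function and value names are illustrative, not taken from the commit or its tests):

  ; %or has a second use (the store), so the old GetDemandedBits SRL handling
  ; bailed out. The trunc demands only bits 8-15 of %or, where the shl
  ; contributes nothing, so the srl feeding the trunc can now peek through
  ; the or to %y.
  define i8 @peek_through_srl(i32 %x, i32 %y, i32* %p) {
    %shl = shl i32 %x, 24
    %or = or i32 %shl, %y
    store i32 %or, i32* %p
    %srl = lshr i32 %or, 8
    %t = trunc i32 %srl to i8
    ret i8 %t
  }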

This is another step towards removing SelectionDAG::GetDemandedBits and just using TargetLowering::SimplifyMultipleUseDemandedBits.

There are a few cases where we end up with extra register moves (e.g. the AArch64 parity tests below), which I think we can accept in exchange for the increased ILP.

Differential Revision: https://reviews.llvm.org/D77804

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/parity.ll
    llvm/test/CodeGen/AArch64/shift-accumulate.ll
    llvm/test/CodeGen/AMDGPU/bswap.ll
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/fabs.f16.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/saddsat.ll
    llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/trunc-combine.ll
    llvm/test/CodeGen/AMDGPU/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/usubsat.ll
    llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
    llvm/test/CodeGen/ARM/parity.ll
    llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
    llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
    llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
    llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll
    llvm/test/CodeGen/RISCV/rv64zbb.ll
    llvm/test/CodeGen/RISCV/rv64zbp.ll
    llvm/test/CodeGen/RISCV/sextw-removal.ll
    llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
    llvm/test/CodeGen/X86/bitreverse.ll
    llvm/test/CodeGen/X86/ctpop-combine.ll
    llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
    llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll
    llvm/test/CodeGen/X86/load-local-v4i5.ll
    llvm/test/CodeGen/X86/masked_compressstore.ll
    llvm/test/CodeGen/X86/vector-mulfix-legalize.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index be6a1785ca14f..1bc595d11ffa3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19053,16 +19053,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
         APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                              ST->getMemoryVT().getScalarSizeInBits());
 
-    // See if we can simplify the input to this truncstore with knowledge that
-    // only the low bits are being used.  For example:
-    // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
+    // See if we can simplify the operation with SimplifyDemandedBits, which
+    // only works if the value has a single use.
     AddToWorklist(Value.getNode());
-    if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
-      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
-                               ST->getMemOperand());
-
-    // Otherwise, see if we can simplify the operation with
-    // SimplifyDemandedBits, which only works if the value has a single use.
     if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
       // Re-visit the store if anything changed and the store hasn't been merged
       // with another node (N is deleted) SimplifyDemandedBits will add Value's
@@ -19072,6 +19065,13 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
         AddToWorklist(N);
       return SDValue(N, 0);
     }
+
+    // Otherwise, see if we can simplify the input to this truncstore with
+    // knowledge that only the low bits are being used.  For example:
+    // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
+    if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
+      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
+                               ST->getMemOperand());
   }
 
   // If this is a load followed by a store to the same location, then the store

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4a960c4e15b41..893795c3decab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2479,24 +2479,6 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits) {
       return getConstant(NewVal, SDLoc(V), V.getValueType());
     break;
   }
-  case ISD::SRL:
-    // Only look at single-use SRLs.
-    if (!V.getNode()->hasOneUse())
-      break;
-    if (auto *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
-      // See if we can recursively simplify the LHS.
-      unsigned Amt = RHSC->getZExtValue();
-
-      // Watch out for shift count overflow though.
-      if (Amt >= DemandedBits.getBitWidth())
-        break;
-      APInt SrcDemandedBits = DemandedBits << Amt;
-      if (SDValue SimplifyLHS = TLI->SimplifyMultipleUseDemandedBits(
-              V.getOperand(0), SrcDemandedBits, *this))
-        return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
-                       V.getOperand(1));
-    }
-    break;
   }
   return SDValue();
 }

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bd471f23d512d..11ad90faa7569 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1880,6 +1880,16 @@ bool TargetLowering::SimplifyDemandedBits(
       Known.One.lshrInPlace(ShAmt);
       // High bits known zero.
       Known.Zero.setHighBits(ShAmt);
+
+      // Attempt to avoid multi-use ops if we don't need anything from them.
+      if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+        SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+            Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+        if (DemandedOp0) {
+          SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1);
+          return TLO.CombineTo(Op, NewOp);
+        }
+      }
     }
     break;
   }

diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index b281cb03e05bf..534892d0a24d1 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -47,8 +47,8 @@ define i17 @parity_17(i17 %x) {
 ; CHECK-LABEL: parity_17:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0x1ffff
-; CHECK-NEXT:    eor w8, w8, w8, lsr #16
-; CHECK-NEXT:    eor w8, w8, w8, lsr #8
+; CHECK-NEXT:    eor w9, w8, w8, lsr #16
+; CHECK-NEXT:    eor w8, w9, w8, lsr #8
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #4
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #2
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1

diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
index 7f18fdbbe8b82..e3dd6511c93e3 100644
--- a/llvm/test/CodeGen/AArch64/shift-accumulate.ll
+++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll
@@ -92,8 +92,8 @@ define <4 x i32> @ssra_v4i32(<4 x i32> %0) {
 define <1 x i64> @ssra_v1i64(<2 x i32> %0) {
 ; CHECK-LABEL: ssra_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v0.2s, #64, lsl #24
 ; CHECK-NEXT:    ushr d1, d0, #63
+; CHECK-NEXT:    bic v0.2s, #64, lsl #24
 ; CHECK-NEXT:    ssra d1, d0, #62
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
@@ -108,8 +108,8 @@ define <1 x i64> @ssra_v1i64(<2 x i32> %0) {
 define <2 x i64> @ssra_v2i64(<4 x i32> %0) {
 ; CHECK-LABEL: ssra_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v0.4s, #64, lsl #24
 ; CHECK-NEXT:    ushr v1.2d, v0.2d, #63
+; CHECK-NEXT:    bic v0.4s, #64, lsl #24
 ; CHECK-NEXT:    ssra v1.2d, v0.2d, #62
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 784336a05da1c..368463c1ac5c3 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -463,10 +463,10 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v2i16:
@@ -530,12 +530,12 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 ; SI-NEXT:    v_bfi_b32 v2, s4, v2, v7
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v4i16:

diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 28b830903c59b..a05b764408788 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -209,27 +209,27 @@ define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
-; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
+; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v7, s1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v5 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v6 offset:5
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v1
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v2 offset:1
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v8 offset:6
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v0 offset:7
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v3 offset:2
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v4 offset:3
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds8align1:
@@ -294,19 +294,19 @@ define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:2
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:4
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:6
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:2
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:4
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:6
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds8align2:
@@ -399,12 +399,14 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
@@ -495,16 +497,16 @@ define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:10
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:8
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:8
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:4
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:10
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:8
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:8
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1
 ; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2 offset:2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)

diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 6456c87a31fbf..9c3c09d874851 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -12,9 +12,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -29,10 +29,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB0_3
 ; SI-NEXT:    s_branch .LBB0_4
@@ -48,9 +47,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -63,10 +62,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; SI-NEXT:  .LBB0_4: ; %exit
 ; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
@@ -83,11 +81,10 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -161,28 +158,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB1_3
 ; SI-NEXT:    s_branch .LBB1_4
 ; SI-NEXT:  .LBB1_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB1_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -197,25 +192,23 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v0
+; SI-NEXT:    v_or_b32_e32 v5, v5, v1
 ; SI-NEXT:  .LBB1_4: ; %exit
-; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
 ; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
@@ -224,14 +217,14 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v7, vcc
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -314,19 +307,18 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
+; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB2_3
 ; SI-NEXT:    s_branch .LBB2_4
 ; SI-NEXT:  .LBB2_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB2_3: ; %T
@@ -355,12 +347,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:  .LBB2_4: ; %exit
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
@@ -444,9 +435,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -477,10 +468,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB3_3
 ; SI-NEXT:    s_branch .LBB3_4
@@ -496,9 +486,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -527,10 +517,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; SI-NEXT:  .LBB3_4: ; %exit
 ; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
@@ -547,11 +536,10 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -641,13 +629,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -665,20 +653,18 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v7, v2
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB4_3
 ; SI-NEXT:    s_branch .LBB4_4
 ; SI-NEXT:  .LBB4_2:
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB4_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -695,9 +681,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -718,16 +704,14 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v4, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; SI-NEXT:  .LBB4_4: ; %exit
 ; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
 ; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
@@ -743,7 +727,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
@@ -858,19 +842,18 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
 ; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
+; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    s_mov_b64 vcc, exec
 ; SI-NEXT:    s_cbranch_execz .LBB5_3
 ; SI-NEXT:    s_branch .LBB5_4
 ; SI-NEXT:  .LBB5_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:  .LBB5_3: ; %T
@@ -915,12 +898,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:  .LBB5_4: ; %exit
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000

diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 0423a3a5e27a1..dd2e41f425f58 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -535,7 +535,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)*
 ; CI-NEXT:    flat_load_dword v0, v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_bfe_u32 v1, v0, 16, 15
-; CI-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; CI-NEXT:    flat_store_short v[0:1], v0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    flat_store_short v[0:1], v1

diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 03f7b2bd1869f..dee950c5c0ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -802,14 +802,14 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
-; SI-NEXT:    v_or_b32_e32 v3, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i16:
@@ -1021,17 +1021,17 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; SI-NEXT:    v_or_b32_e32 v4, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
-; SI-NEXT:    v_or_b32_e32 v4, 16, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v5, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_alignbit_b32 v2, v2, v6, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v4i16:

diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index b44d8bd9276e5..aafa3b3a1d4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -963,34 +963,30 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_i32 v4, v2, 0, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 24, v2
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
 ; GFX7-NEXT:    v_bfe_i32 v7, v0, 0, 8
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v4
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 24, v0
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, 0, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v0, 0, v0, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v8, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v8, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 693d5f62975fb..2710aa098722c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1851,27 +1851,23 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xff00, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xff00, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; GFX7-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT:    v_alignbit_b32 v3, s10, v3, 16
+; GFX7-NEXT:    v_alignbit_b32 v6, 0, v6, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
-; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v8, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2143,18 +2139,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-NODL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX9-NODL-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-NODL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v7
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
-; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
 ; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v6
+; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
+; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v8
 ; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2173,18 +2167,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX9-DL-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v7
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
+; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v8
 ; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2203,24 +2195,23 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DL-NEXT:    v_lshrrev_b16 v8, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-DL-NEXT:    v_lshrrev_b16 v9, 8, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
-; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v6, v7
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v7, v8
+; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v9
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v4, 8, v4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v8
-; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
-; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX10-DL-NEXT:    v_mad_u16 v1, v6, v7, v1
+; GFX10-DL-NEXT:    v_mad_u16 v1, v7, v8, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 40939e300e199..b41bebb03b301 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2204,60 +2204,48 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_i32 v3, v2, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v2, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v13
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
-; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v6, v1
-; GFX7-NEXT:    v_bfe_i32 v7, v2, 24, 4
+; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v2
 ; GFX7-NEXT:    v_bfe_i32 v2, v2, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v15
-; GFX7-NEXT:    v_mad_u32_u24 v1, v16, v11, v1
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2844,84 +2832,50 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
-; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v7, v2, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_ashrrev_i32_e32 v10, 28, v0
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v16, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v8, v7
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v10
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v12
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v16
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 0, 4
+; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 8, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 28, v2
+; GFX7-NEXT:    v_bfe_i32 v9, v2, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 8, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v15, 28, v0
+; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX7-NEXT:    v_or_b32_e32 v8, v11, v10
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v13
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v2
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT:    v_bfe_u32 v9, v2, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
-; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v4
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v5
-; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT:    v_bfe_u32 v11, v4, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
-; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v15, v0
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v16, v0
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v6, v6, 8, 8
-; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT:    v_alignbit_b32 v9, 0, v9, 24
+; GFX7-NEXT:    v_alignbit_b32 v16, 0, v16, 24
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -3051,6 +3005,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
@@ -3058,63 +3014,60 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
+; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
+; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
+; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
-; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
-; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
+; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
+; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, v7, v4
-; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
+; GFX9-NEXT:    v_add_u16_e32 v2, v7, v4
+; GFX9-NEXT:    v_add_u16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
 ; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
+; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
 ; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3144,6 +3097,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
@@ -3151,63 +3106,60 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v7, v4
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_add_u16_e32 v2, v7, v4
+; GFX9-DL-NEXT:    v_add_u16_e32 v1, v2, v1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
-; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
+; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
 ; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -3234,68 +3186,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
-; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
 ; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v0, v11
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v10
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v5, v12
 ; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
+; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 8, v2
 ; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
-; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v13
 ; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
 ; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3334,78 +3285,77 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
 ; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v11
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v0
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 8, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
-; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v5, v12
+; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
 ; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
-; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v13
 ; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
-; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
+; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v10
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v11, v0
+; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index f9d7f537f2953..7021bcc803ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2121,42 +2121,30 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_u32 v8, v2, 20, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 12, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
-; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v7, 15, v2
-; GFX7-NEXT:    v_alignbit_b32 v2, v8, v2, 16
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 12, v0
-; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
-; GFX7-NEXT:    v_or_b32_e32 v8, v14, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX7-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v8, v1
-; GFX7-NEXT:    v_bfe_u32 v13, v0, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v15, v0, 20, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
 ; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
-; GFX7-NEXT:    v_alignbit_b32 v0, v15, v0, 16
-; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v16, v15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -2478,70 +2466,40 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
-; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
-; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, 15, v2
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 4, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v8
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
-; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v11
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v15
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xf0f, v0
-; GFX7-NEXT:    v_or_b32_e32 v8, v10, v8
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xf0f, v2
-; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
-; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT:    v_and_b32_e32 v7, 15, v3
-; GFX7-NEXT:    v_and_b32_e32 v13, 15, v4
-; GFX7-NEXT:    v_bfe_u32 v8, v3, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v14, v4, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
+; GFX7-NEXT:    v_bfe_u32 v8, v2, 12, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v2
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 4, 4
+; GFX7-NEXT:    v_bfe_u32 v15, v0, 12, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v13, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT:    v_and_b32_e32 v9, 15, v2
-; GFX7-NEXT:    v_and_b32_e32 v15, 15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v11, v1
-; GFX7-NEXT:    v_bfe_u32 v10, v2, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v16, v0, 8, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v15, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v16, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT:    v_bfe_u32 v3, v2, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v12, v0, 8, 4
+; GFX7-NEXT:    v_alignbit_b32 v2, v9, v2, 24
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 24, v15
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT:    v_alignbit_b32 v8, 0, v8, 24
+; GFX7-NEXT:    v_alignbit_b32 v7, 0, v9, 24
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT:    v_bfe_u32 v11, v0, 16, 4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 28, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v7, v1
+; GFX7-NEXT:    v_bfe_u32 v10, v0, 20, 4
+; GFX7-NEXT:    v_alignbit_b32 v0, v16, v0, 24
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v9, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2635,52 +2593,51 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v0, v1, 4, 4
 ; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v10, v1, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_bfe_u32 v1, v2, 4, 4
 ; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
 ; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
 ; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
+; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v17, v2, 20, 4
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 16, 4
+; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v8, v17, v8
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v1, v18, v10
+; GFX9-NEXT:    v_or_b32_e32 v0, v18, v10
+; GFX9-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v12
-; GFX9-NEXT:    v_or_b32_e32 v7, v12, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_or_b32_e32 v10, v12, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v1, v5, v4
-; GFX9-NEXT:    v_add_u16_e32 v1, v1, v7
+; GFX9-NEXT:    v_add_u16_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_u16_e32 v1, v4, v1
 ; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
 ; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, v0, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
-; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, v0, v7
 ; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2701,52 +2658,51 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
 ; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
+; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
 ; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
 ; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 20, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 16, 4
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v8, v17, v8
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v1, v18, v10
+; GFX9-DL-NEXT:    v_or_b32_e32 v0, v18, v10
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_or_b32_e32 v7, v12, v0
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT:    v_or_b32_e32 v10, v12, v0
 ; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v5, v4
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v7
+; GFX9-DL-NEXT:    v_add_u16_e32 v4, v5, v4
+; GFX9-DL-NEXT:    v_add_u16_e32 v1, v4, v1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v10
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v7
 ; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2770,53 +2726,52 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v10
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
 ; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v13
+; GFX10-DL-NEXT:    v_bfe_u32 v1, v2, 8, 4
+; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v9
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
+; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v7, v1
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v8, v14
-; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX10-DL-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v11, v13
-; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v9, v15
-; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX10-DL-NEXT:    v_lshlrev_b16 v8, 8, v0
+; GFX10-DL-NEXT:    v_and_b32_e32 v13, 15, v2
+; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v9
+; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
+; GFX10-DL-NEXT:    v_or_b32_e32 v6, v1, v6
+; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v15
+; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v14
+; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v0
+; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v13
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v10
-; GFX10-DL-NEXT:    v_mul_lo_u16 v10, v12, v16
-; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-DL-NEXT:    v_or_b32_e32 v7, v7, v2
-; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_e32 v5, v5, v8
-; GFX10-DL-NEXT:    v_or_b32_e32 v1, v10, v1
-; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v12, v7
+; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v10, v16
+; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX10-DL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v9
 ; GFX10-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v5, v6
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v16, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v7, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v7
-; GFX10-DL-NEXT:    v_mad_u16 v0, v9, v15, v0
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX10-DL-NEXT:    v_mad_u16 v0, v10, v16, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 10ec47c935420..6f6ff8124ffdf 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -140,10 +140,10 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
 ; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 14b89416a721a..565ccacbd2638 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -104,17 +104,11 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s0, v0
-; SI-NEXT:    s_lshl_b32 s1, s0, 8
-; SI-NEXT:    s_or_b32 s0, s1, s0
-; SI-NEXT:    s_and_b32 s1, s0, 0xff00
-; SI-NEXT:    s_lshr_b32 s4, s0, 8
-; SI-NEXT:    s_or_b32 s1, s4, s1
-; SI-NEXT:    s_lshl_b32 s4, s1, 16
-; SI-NEXT:    s_or_b32 s1, s1, s4
-; SI-NEXT:    s_or_b32 s0, s0, s4
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -125,12 +119,10 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v0
-; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
@@ -148,17 +140,11 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s0, v0
-; SI-NEXT:    s_lshl_b32 s1, s0, 8
-; SI-NEXT:    s_or_b32 s0, s1, s0
-; SI-NEXT:    s_and_b32 s1, s0, 0xff00
-; SI-NEXT:    s_lshr_b32 s4, s0, 8
-; SI-NEXT:    s_or_b32 s1, s4, s1
-; SI-NEXT:    s_lshl_b32 s4, s1, 16
-; SI-NEXT:    s_or_b32 s1, s1, s4
-; SI-NEXT:    s_or_b32 s0, s0, s4
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index f914c9645df90..ed0fc095ba5d4 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -83,11 +83,11 @@ define i128 @v_shl_i128_vk(i128 %lhs) {
 ; GCN-LABEL: v_shl_i128_vk:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v4, v2, v1, 15
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], 17
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v0, 15
-; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 15
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, 17
   ret i128 %shl
@@ -110,11 +110,11 @@ define i128 @v_ashr_i128_vk(i128 %lhs) {
 ; GCN-LABEL: v_ashr_i128_vk:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_ashr_i64 v[4:5], v[2:3], 33
-; GCN-NEXT:    v_alignbit_b32 v0, v2, v1, 1
-; GCN-NEXT:    v_alignbit_b32 v1, v3, v2, 1
-; GCN-NEXT:    v_mov_b32_e32 v2, v4
-; GCN-NEXT:    v_mov_b32_e32 v3, v5
+; GCN-NEXT:    v_mov_b32_e32 v4, v1
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[2:3], 31
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 33
+; GCN-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, 33
   ret i128 %shl

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 2ab016efffbbb..4eece54e2e2d3 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -140,10 +140,10 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_max_i32_e32 v1, 0xffff8000, v1
 ; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 418dfbcb5cd2e..20ece3d1c1a56 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -56,22 +56,19 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s5
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x3
-; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x0
-; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x2
+; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x3
 ; HAWAII-NEXT:    s_mov_b32 m0, -1
 ; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT:    s_and_b32 s3, s0, 0xffff
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
-; HAWAII-NEXT:    v_mov_b32_e32 v2, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v2, s1
 ; HAWAII-NEXT:    v_mov_b32_e32 v3, s2
-; HAWAII-NEXT:    ds_write_b16 v1, v2 offset:4
+; HAWAII-NEXT:    ds_write_b16 v1, v3 offset:4
 ; HAWAII-NEXT:    s_waitcnt vmcnt(0)
-; HAWAII-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; HAWAII-NEXT:    v_or_b32_e32 v0, s3, v0
-; HAWAII-NEXT:    v_bfe_u32 v0, v0, 16, 7
+; HAWAII-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; HAWAII-NEXT:    ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT:    ds_write_b32 v1, v3
+; HAWAII-NEXT:    ds_write_b32 v1, v2
 ; HAWAII-NEXT:    s_endpgm
 ;
 ; FIJI-LABEL: local_store_i55:

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index c85e78040cd00..eb2549ac60952 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -144,7 +144,7 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: trunc_v2i64_arg_to_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 9a8b1d09ca88f..37aafd05a7e14 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -116,9 +116,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 18c7f7d7af109..ecf4ded2b4830 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -221,9 +221,9 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i16:

diff  --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
index 160646b1a5ba3..cf07cf572523e 100644
--- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
@@ -90,16 +90,16 @@ define void @i56_or(i56* %a) {
 ;
 ; BE-LABEL: i56_or:
 ; BE:       @ %bb.0:
-; BE-NEXT:    ldr r1, [r0]
-; BE-NEXT:    strb r1, [r0, #3]
-; BE-NEXT:    ldrh r2, [r0, #4]!
-; BE-NEXT:    ldrb r3, [r0, #2]
+; BE-NEXT:    mov r1, r0
+; BE-NEXT:    ldr r0, [r0]
+; BE-NEXT:    ldrh r2, [r1, #4]!
+; BE-NEXT:    ldrb r3, [r1, #2]
 ; BE-NEXT:    orr r2, r3, r2, lsl #8
-; BE-NEXT:    orr r1, r2, r1, lsl #24
-; BE-NEXT:    orr r1, r1, #384
-; BE-NEXT:    strb r1, [r0, #2]
-; BE-NEXT:    lsr r1, r1, #8
-; BE-NEXT:    strh r1, [r0]
+; BE-NEXT:    orr r0, r2, r0, lsl #24
+; BE-NEXT:    orr r0, r0, #384
+; BE-NEXT:    strb r0, [r1, #2]
+; BE-NEXT:    lsr r0, r0, #8
+; BE-NEXT:    strh r0, [r1]
 ; BE-NEXT:    mov pc, lr
   %aa = load i56, i56* %a
   %b = or i56 %aa, 384
@@ -118,16 +118,10 @@ define void @i56_and_or(i56* %a) {
 ;
 ; BE-LABEL: i56_and_or:
 ; BE:       @ %bb.0:
-; BE-NEXT:    ldr r1, [r0]
+; BE-NEXT:    ldrh r1, [r0, #4]!
 ; BE-NEXT:    mov r2, #128
-; BE-NEXT:    strb r1, [r0, #3]
-; BE-NEXT:    ldrh r12, [r0, #4]!
-; BE-NEXT:    ldrb r3, [r0, #2]
+; BE-NEXT:    orr r1, r1, #1
 ; BE-NEXT:    strb r2, [r0, #2]
-; BE-NEXT:    orr r2, r3, r12, lsl #8
-; BE-NEXT:    orr r1, r2, r1, lsl #24
-; BE-NEXT:    orr r1, r1, #384
-; BE-NEXT:    lsr r1, r1, #8
 ; BE-NEXT:    strh r1, [r0]
 ; BE-NEXT:    mov pc, lr
 
@@ -149,13 +143,10 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
 ;
 ; BE-LABEL: i56_insert_bit:
 ; BE:       @ %bb.0:
-; BE-NEXT:    ldr r2, [r0]
-; BE-NEXT:    strb r2, [r0, #3]
-; BE-NEXT:    ldrh r12, [r0, #4]!
-; BE-NEXT:    ldrb r3, [r0, #2]
-; BE-NEXT:    orr r3, r3, r12, lsl #8
-; BE-NEXT:    orr r2, r3, r2, lsl #24
-; BE-NEXT:    bic r2, r2, #8192
+; BE-NEXT:    ldrh r2, [r0, #4]!
+; BE-NEXT:    mov r3, #57088
+; BE-NEXT:    orr r3, r3, #16711680
+; BE-NEXT:    and r2, r3, r2, lsl #8
 ; BE-NEXT:    orr r1, r2, r1, lsl #13
 ; BE-NEXT:    lsr r1, r1, #8
 ; BE-NEXT:    strh r1, [r0]

diff  --git a/llvm/test/CodeGen/ARM/parity.ll b/llvm/test/CodeGen/ARM/parity.ll
index 40c0d7bd32f11..a077dd0b45f8c 100644
--- a/llvm/test/CodeGen/ARM/parity.ll
+++ b/llvm/test/CodeGen/ARM/parity.ll
@@ -47,8 +47,8 @@ define i17 @parity_17(i17 %x) {
 ; CHECK-LABEL: parity_17:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    bfc r0, #17, #15
-; CHECK-NEXT:    eor r0, r0, r0, lsr #16
-; CHECK-NEXT:    eor r0, r0, r0, lsr #8
+; CHECK-NEXT:    eor r1, r0, r0, lsr #16
+; CHECK-NEXT:    eor r0, r1, r0, lsr #8
 ; CHECK-NEXT:    eor r0, r0, r0, lsr #4
 ; CHECK-NEXT:    eor r0, r0, r0, lsr #2
 ; CHECK-NEXT:    eor r0, r0, r0, lsr #1

diff  --git a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
index 41f2ab49b8dcb..9cc42cf74b7f9 100644
--- a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll
@@ -84,35 +84,35 @@ define float @fooul(float %X) #0 {
 ; PPC64-NEXT:    addi 3, 5, 0
 ; PPC64-NEXT:  .LBB2_2: # %entry
 ; PPC64-NEXT:    sradi 4, 3, 53
-; PPC64-NEXT:    clrldi 5, 3, 63
+; PPC64-NEXT:    rldicl 5, 3, 63, 1
 ; PPC64-NEXT:    addi 4, 4, 1
+; PPC64-NEXT:    clrldi 6, 3, 63
 ; PPC64-NEXT:    cmpldi 4, 1
-; PPC64-NEXT:    rldicl 4, 3, 63, 1
-; PPC64-NEXT:    or 5, 5, 4
-; PPC64-NEXT:    rldicl 6, 5, 11, 53
-; PPC64-NEXT:    addi 6, 6, 1
-; PPC64-NEXT:    clrldi 7, 5, 53
-; PPC64-NEXT:    cmpldi 1, 6, 1
-; PPC64-NEXT:    clrldi 6, 3, 53
+; PPC64-NEXT:    clrldi 4, 3, 53
+; PPC64-NEXT:    or 6, 6, 5
+; PPC64-NEXT:    clrldi 7, 6, 53
+; PPC64-NEXT:    addi 4, 4, 2047
 ; PPC64-NEXT:    addi 7, 7, 2047
-; PPC64-NEXT:    addi 6, 6, 2047
-; PPC64-NEXT:    or 4, 7, 4
-; PPC64-NEXT:    or 6, 6, 3
-; PPC64-NEXT:    rldicl 4, 4, 53, 11
-; PPC64-NEXT:    rldicr 6, 6, 0, 52
+; PPC64-NEXT:    or 4, 4, 3
+; PPC64-NEXT:    or 5, 7, 5
+; PPC64-NEXT:    rldicl 7, 3, 10, 54
+; PPC64-NEXT:    rldicr 4, 4, 0, 52
+; PPC64-NEXT:    addi 7, 7, 1
 ; PPC64-NEXT:    bc 12, 1, .LBB2_4
 ; PPC64-NEXT:  # %bb.3: # %entry
-; PPC64-NEXT:    ori 6, 3, 0
+; PPC64-NEXT:    ori 4, 3, 0
 ; PPC64-NEXT:    b .LBB2_4
 ; PPC64-NEXT:  .LBB2_4: # %entry
-; PPC64-NEXT:    rldicl 4, 4, 11, 1
-; PPC64-NEXT:    cmpdi 3, 0
-; PPC64-NEXT:    std 6, -32(1)
-; PPC64-NEXT:    bc 12, 5, .LBB2_6
+; PPC64-NEXT:    rldicl 5, 5, 53, 11
+; PPC64-NEXT:    std 4, -32(1)
+; PPC64-NEXT:    rldicl 4, 5, 11, 1
+; PPC64-NEXT:    cmpldi 7, 1
+; PPC64-NEXT:    bc 12, 1, .LBB2_6
 ; PPC64-NEXT:  # %bb.5: # %entry
-; PPC64-NEXT:    ori 4, 5, 0
+; PPC64-NEXT:    ori 4, 6, 0
 ; PPC64-NEXT:    b .LBB2_6
 ; PPC64-NEXT:  .LBB2_6: # %entry
+; PPC64-NEXT:    cmpdi 3, 0
 ; PPC64-NEXT:    std 4, -24(1)
 ; PPC64-NEXT:    bc 12, 0, .LBB2_8
 ; PPC64-NEXT:  # %bb.7: # %entry

diff  --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 8003136e1286c..7c9f9b9c80ad6 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -85,7 +85,7 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: test_bswap_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
@@ -491,7 +491,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: test_bitreverse_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2

diff  --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index e27947dd0a8be..f2496dddced3b 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -218,7 +218,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -285,7 +285,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    subw a0, a0, a1
+; RV64M-NEXT:    sub a0, a0, a1
 ; RV64M-NEXT:    lui a1, 209715
 ; RV64M-NEXT:    addiw a1, a1, 819
 ; RV64M-NEXT:    and a2, a0, a1
@@ -683,7 +683,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -739,7 +739,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    subw a0, a0, a1
+; RV64M-NEXT:    sub a0, a0, a1
 ; RV64M-NEXT:    lui a1, 209715
 ; RV64M-NEXT:    addiw a1, a1, 819
 ; RV64M-NEXT:    and a2, a0, a1
@@ -1214,7 +1214,7 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -1297,7 +1297,7 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    subw a0, a0, a1
+; RV64M-NEXT:    sub a0, a0, a1
 ; RV64M-NEXT:    lui a1, 209715
 ; RV64M-NEXT:    addiw a1, a1, 819
 ; RV64M-NEXT:    and a2, a0, a1
@@ -1805,7 +1805,7 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -1877,7 +1877,7 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    subw a0, a0, a1
+; RV64M-NEXT:    sub a0, a0, a1
 ; RV64M-NEXT:    lui a1, 209715
 ; RV64M-NEXT:    addiw a1, a1, 819
 ; RV64M-NEXT:    and a2, a0, a1
@@ -2300,7 +2300,7 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -2350,7 +2350,7 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
-; RV64M-NEXT:    subw a0, a0, a1
+; RV64M-NEXT:    sub a0, a0, a1
 ; RV64M-NEXT:    lui a1, 209715
 ; RV64M-NEXT:    addiw a1, a1, 819
 ; RV64M-NEXT:    and a2, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll
index b5a7be45d3858..1bdcdcc862ecf 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll
@@ -355,11 +355,11 @@ define i64 @roriw_bug(i64 %x) nounwind {
 ; CHECK-LABEL: roriw_bug:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a1, a0, 31
-; CHECK-NEXT:    andi a0, a0, -2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    or a1, a1, a2
-; CHECK-NEXT:    sext.w a1, a1
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    andi a2, a0, -2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    or a0, a1, a0
+; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    xor a0, a2, a0
 ; CHECK-NEXT:    ret
   %a = shl i64 %x, 31
   %b = and i64 %x, 18446744073709551614

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 15cf7fcfddbb1..ca3f3f331cc51 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -29,7 +29,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -83,7 +83,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -146,7 +146,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -204,7 +204,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -273,7 +273,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -380,7 +380,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -423,7 +423,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -465,7 +465,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -520,7 +520,7 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -622,7 +622,7 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -660,7 +660,7 @@ define signext i32 @ctpop_i32_load(i32* %p) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    lui a1, 209715
 ; RV64I-NEXT:    addiw a1, a1, 819
 ; RV64I-NEXT:    and a2, a0, a1
@@ -1028,7 +1028,7 @@ declare i32 @llvm.bswap.i32(i32)
 define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: bswap_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
@@ -1055,7 +1055,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-LABEL: bswap_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    addiw a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll
index 04e1021404306..48c6878f53f1f 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll
@@ -2447,7 +2447,7 @@ declare i32 @llvm.bswap.i32(i32)
 define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: bswap_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
@@ -2473,7 +2473,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-LABEL: bswap_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    addiw a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
@@ -2614,7 +2614,7 @@ declare i32 @llvm.bitreverse.i32(i32)
 define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: bitreverse_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a1, a0, 8
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
@@ -2661,7 +2661,7 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
 define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
 ; RV64I-LABEL: bitreverse_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    addiw a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
@@ -2780,7 +2780,7 @@ define i32 @bswap_rotr_i32(i32 %a) {
 ; RV64I-NEXT:    slli a2, a0, 24
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    srliw a2, a0, 24
-; RV64I-NEXT:    srliw a0, a0, 16
+; RV64I-NEXT:    srli a0, a0, 16
 ; RV64I-NEXT:    slli a0, a0, 8
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slliw a0, a0, 16
@@ -2801,7 +2801,7 @@ define i32 @bswap_rotl_i32(i32 %a) {
 ; RV64I-LABEL: bswap_rotl_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 24
-; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    srli a2, a0, 16
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    slli a2, a0, 8

diff  --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index e248be5093eca..7b382d30dcd64 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -192,7 +192,7 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    and a0, a0, s0
-; RV64I-NEXT:    subw a0, a1, a0
+; RV64I-NEXT:    sub a0, a1, a0
 ; RV64I-NEXT:    and a2, a0, s1
 ; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    and a0, a0, s1

diff  --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index bac5f4c08ed79..49f11597d1ec8 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -76,38 +76,41 @@ define void @fun2(<8 x i32> %src, <8 x i31>* %p)
 ; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    .cfi_offset %r14, -48
 ; CHECK-NEXT:    .cfi_offset %r15, -40
-; CHECK-NEXT:    vlgvf %r0, %v26, 3
-; CHECK-NEXT:    vlgvf %r4, %v24, 1
-; CHECK-NEXT:    vlgvf %r3, %v24, 2
-; CHECK-NEXT:    srlk %r1, %r0, 8
+; CHECK-NEXT:    vlgvf %r1, %v26, 3
+; CHECK-NEXT:    vlgvf %r0, %v26, 2
+; CHECK-NEXT:    stc %r1, 30(%r2)
+; CHECK-NEXT:    srlk %r3, %r1, 8
+; CHECK-NEXT:    risbgn %r1, %r1, 33, 167, 0
+; CHECK-NEXT:    vlgvf %r5, %v24, 2
+; CHECK-NEXT:    rosbg %r1, %r0, 2, 32, 31
+; CHECK-NEXT:    sth %r3, 28(%r2)
+; CHECK-NEXT:    srlg %r1, %r1, 24
+; CHECK-NEXT:    vlgvf %r3, %v24, 3
+; CHECK-NEXT:    st %r1, 24(%r2)
+; CHECK-NEXT:    vlgvf %r1, %v26, 0
+; CHECK-NEXT:    risbgn %r14, %r5, 6, 164, 27
+; CHECK-NEXT:    sllg %r4, %r3, 60
+; CHECK-NEXT:    rosbg %r14, %r3, 37, 63, 60
+; CHECK-NEXT:    sllg %r3, %r14, 8
+; CHECK-NEXT:    rosbg %r4, %r1, 4, 34, 29
+; CHECK-NEXT:    rosbg %r3, %r4, 56, 63, 8
+; CHECK-NEXT:    stg %r3, 8(%r2)
+; CHECK-NEXT:    vlgvf %r3, %v24, 1
+; CHECK-NEXT:    sllg %r4, %r3, 58
+; CHECK-NEXT:    rosbg %r4, %r5, 6, 36, 27
 ; CHECK-NEXT:    vlgvf %r5, %v24, 0
-; CHECK-NEXT:    sth %r1, 28(%r2)
-; CHECK-NEXT:    sllg %r1, %r4, 58
 ; CHECK-NEXT:    sllg %r5, %r5, 25
-; CHECK-NEXT:    stc %r0, 30(%r2)
-; CHECK-NEXT:    rosbg %r1, %r3, 6, 36, 27
-; CHECK-NEXT:    vlgvf %r3, %v24, 3
-; CHECK-NEXT:    rosbg %r5, %r4, 39, 63, 58
-; CHECK-NEXT:    sllg %r4, %r5, 8
-; CHECK-NEXT:    rosbg %r1, %r3, 37, 63, 60
-; CHECK-NEXT:    vlgvf %r5, %v26, 1
-; CHECK-NEXT:    rosbg %r4, %r1, 56, 63, 8
-; CHECK-NEXT:    stg %r4, 0(%r2)
-; CHECK-NEXT:    vlgvf %r4, %v26, 2
-; CHECK-NEXT:    sllg %r14, %r5, 62
-; CHECK-NEXT:    sllg %r3, %r3, 60
-; CHECK-NEXT:    rosbg %r14, %r4, 2, 32, 31
-; CHECK-NEXT:    rosbg %r14, %r0, 33, 63, 0
-; CHECK-NEXT:    srlg %r0, %r14, 24
-; CHECK-NEXT:    st %r0, 24(%r2)
-; CHECK-NEXT:    vlgvf %r0, %v26, 0
-; CHECK-NEXT:    rosbg %r3, %r0, 4, 34, 29
-; CHECK-NEXT:    sllg %r0, %r1, 8
-; CHECK-NEXT:    rosbg %r3, %r5, 35, 63, 62
-; CHECK-NEXT:    rosbg %r0, %r3, 56, 63, 8
-; CHECK-NEXT:    stg %r0, 8(%r2)
-; CHECK-NEXT:    sllg %r0, %r3, 8
-; CHECK-NEXT:    rosbg %r0, %r14, 56, 63, 8
+; CHECK-NEXT:    rosbg %r5, %r3, 39, 63, 58
+; CHECK-NEXT:    sllg %r3, %r5, 8
+; CHECK-NEXT:    rosbg %r3, %r4, 56, 63, 8
+; CHECK-NEXT:    stg %r3, 0(%r2)
+; CHECK-NEXT:    vlgvf %r3, %v26, 1
+; CHECK-NEXT:    sllg %r4, %r3, 62
+; CHECK-NEXT:    rosbg %r4, %r0, 2, 32, 31
+; CHECK-NEXT:    risbgn %r0, %r1, 4, 162, 29
+; CHECK-NEXT:    rosbg %r0, %r3, 35, 63, 62
+; CHECK-NEXT:    sllg %r0, %r0, 8
+; CHECK-NEXT:    rosbg %r0, %r4, 56, 63, 8
 ; CHECK-NEXT:    stg %r0, 16(%r2)
 ; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    br %r14

diff  --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 04aff9a727c22..c8fb2335874f2 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -399,37 +399,36 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $8, %al
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    addb %cl, %dl
 ; X86-NEXT:    andb $4, %dl
-; X86-NEXT:    shlb $3, %cl
-; X86-NEXT:    andb $8, %cl
-; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shrb %dl
-; X86-NEXT:    andb $2, %dl
-; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    movb %cl, %ah
+; X86-NEXT:    shlb $3, %ah
+; X86-NEXT:    andb $8, %ah
+; X86-NEXT:    orb %dl, %ah
+; X86-NEXT:    shrb %cl
+; X86-NEXT:    andb $2, %cl
+; X86-NEXT:    orb %ah, %cl
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bitreverse_i4:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal (%rdi,%rdi), %ecx
-; X64-NEXT:    leal (,%rdi,8), %edx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    andb $8, %al
+; X64-NEXT:    leal (%rdi,%rdi), %ecx
 ; X64-NEXT:    andb $4, %cl
+; X64-NEXT:    leal (,%rdi,8), %edx
 ; X64-NEXT:    andb $8, %dl
 ; X64-NEXT:    orb %cl, %dl
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrb %cl
-; X64-NEXT:    andb $2, %cl
-; X64-NEXT:    orb %dl, %cl
+; X64-NEXT:    shrb %dil
+; X64-NEXT:    andb $2, %dil
+; X64-NEXT:    orb %dil, %dl
 ; X64-NEXT:    shrb $3, %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    orb %dl, %al
 ; X64-NEXT:    retq
 ;
 ; X86XOP-LABEL: test_bitreverse_i4:

diff  --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index 086deb5d89eb2..123195e626997 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,16 +88,16 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    andb $127, %dil
-; NO-POPCOUNT-NEXT:    movl %edi, %eax
-; NO-POPCOUNT-NEXT:    shrb %al
-; NO-POPCOUNT-NEXT:    andb $21, %al
-; NO-POPCOUNT-NEXT:    subb %al, %dil
 ; NO-POPCOUNT-NEXT:    movl %edi, %ecx
+; NO-POPCOUNT-NEXT:    andb $127, %cl
+; NO-POPCOUNT-NEXT:    shrb %dil
+; NO-POPCOUNT-NEXT:    andb $21, %dil
+; NO-POPCOUNT-NEXT:    subb %dil, %cl
+; NO-POPCOUNT-NEXT:    movl %ecx, %eax
+; NO-POPCOUNT-NEXT:    andb $51, %al
+; NO-POPCOUNT-NEXT:    shrb $2, %cl
 ; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    shrb $2, %dil
-; NO-POPCOUNT-NEXT:    andb $51, %dil
-; NO-POPCOUNT-NEXT:    addb %dil, %cl
+; NO-POPCOUNT-NEXT:    addb %al, %cl
 ; NO-POPCOUNT-NEXT:    movl %ecx, %eax
 ; NO-POPCOUNT-NEXT:    shrb $4, %al
 ; NO-POPCOUNT-NEXT:    addb %cl, %al

diff  --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index cb5d3b0ac21c0..649d3f8b48292 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -41,7 +41,7 @@ define void @i24_and_or(ptr %a) {
 ; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    orl $384, %ecx # imm = 0x180
-; X86-NEXT:    andl $16777088, %ecx # imm = 0xFFFF80
+; X86-NEXT:    andl $-128, %ecx
 ; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -53,7 +53,7 @@ define void @i24_and_or(ptr %a) {
 ; X64-NEXT:    shll $16, %ecx
 ; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    orl $384, %ecx # imm = 0x180
-; X64-NEXT:    andl $16777088, %ecx # imm = 0xFFFF80
+; X64-NEXT:    andl $-128, %ecx
 ; X64-NEXT:    movw %cx, (%rdi)
 ; X64-NEXT:    retq
   %b = load i24, ptr %a, align 1
@@ -114,19 +114,13 @@ define void @i56_or(ptr %a) {
 ;
 ; X64-LABEL: i56_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movzbl 6(%rdi), %ecx
-; X64-NEXT:    movb %cl, 6(%rdi)
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    orq $384, %rax # imm = 0x180
-; X64-NEXT:    movl %eax, (%rdi)
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    movw %ax, 4(%rdi)
+; X64-NEXT:    movzbl 6(%rdi), %eax
+; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    movzwl 4(%rdi), %ecx
+; X64-NEXT:    movw %cx, 4(%rdi)
+; X64-NEXT:    shrq $16, %rax
+; X64-NEXT:    movb %al, 6(%rdi)
+; X64-NEXT:    orl $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i56, ptr %a, align 1
   %b = or i56 %aa, 384
@@ -183,22 +177,21 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ;
 ; X64-LABEL: i56_insert_bit:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movzwl 4(%rdi), %ecx
-; X64-NEXT:    movzbl 6(%rdi), %edx
-; X64-NEXT:    movb %dl, 6(%rdi)
-; X64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-NEXT:    shll $16, %edx
-; X64-NEXT:    orl %ecx, %edx
-; X64-NEXT:    shlq $32, %rdx
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    shlq $13, %rax
-; X64-NEXT:    andq $-8193, %rcx # imm = 0xDFFF
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movl %ecx, (%rdi)
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    movw %cx, 4(%rdi)
+; X64-NEXT:    movzwl 4(%rdi), %eax
+; X64-NEXT:    movzbl 6(%rdi), %ecx
+; X64-NEXT:    movb %cl, 6(%rdi)
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    shll $16, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    shlq $32, %rcx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    shll $13, %esi
+; X64-NEXT:    andq $-8193, %rax # imm = 0xDFFF
+; X64-NEXT:    orl %eax, %esi
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    movw %ax, 4(%rdi)
+; X64-NEXT:    movl %esi, (%rdi)
 ; X64-NEXT:    retq
   %extbit = zext i1 %bit to i56
   %b = load i56, ptr %a, align 1

diff  --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll
index 48a287e31a69c..64c79b1eba87e 100644
--- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll
+++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s
 
+; TODO: This might not be testing the original issue anymore? Should the movl still be removed?
 define fastcc i32 @t() nounwind  {
 ; CHECK-LABEL: t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movzwl 0, %eax
-; CHECK-NEXT:    orl $2, %eax
-; CHECK-NEXT:    movw %ax, 0
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    orl $2, %ecx
+; CHECK-NEXT:    movw %cx, 0
 ; CHECK-NEXT:    shrl $3, %eax
 ; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/load-local-v4i5.ll b/llvm/test/CodeGen/X86/load-local-v4i5.ll
index 34100bdbc2c4e..1d119b1dfefc2 100644
--- a/llvm/test/CodeGen/X86/load-local-v4i5.ll
+++ b/llvm/test/CodeGen/X86/load-local-v4i5.ll
@@ -11,6 +11,9 @@ define void @_start() {
 ; CHECK-NEXT:    movzbl -9(%rsp), %ecx
 ; CHECK-NEXT:    movzbl -10(%rsp), %edx
 ; CHECK-NEXT:    movzbl -11(%rsp), %esi
+; CHECK-NEXT:    movzbl %cl, %edi
+; CHECK-NEXT:    shrb %cl
+; CHECK-NEXT:    movb %cl, -2(%rsp)
 ; CHECK-NEXT:    andl $31, %eax
 ; CHECK-NEXT:    andl $31, %esi
 ; CHECK-NEXT:    shll $5, %esi
@@ -18,16 +21,12 @@ define void @_start() {
 ; CHECK-NEXT:    andl $31, %edx
 ; CHECK-NEXT:    shll $10, %edx
 ; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shll $15, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
-; CHECK-NEXT:    movw %cx, -4(%rsp)
-; CHECK-NEXT:    shrl $16, %ecx
-; CHECK-NEXT:    andl $15, %ecx
-; CHECK-NEXT:    movb %cl, -2(%rsp)
-; CHECK-NEXT:    movb %al, -5(%rsp)
-; CHECK-NEXT:    cmpb $31, %al
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $15, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    movw %ax, -4(%rsp)
+; CHECK-NEXT:    movb %dil, -5(%rsp)
+; CHECK-NEXT:    cmpb $31, %dil
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %Then
 ; CHECK-NEXT:    int3

diff  --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 2a7085b9ce604..65491586558f4 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -517,21 +517,20 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm2
 ; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    kmovw %k1, %eax
-; AVX512F-NEXT:    movzbl %al, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl %ecx
-; AVX512F-NEXT:    andl $-43, %ecx
-; AVX512F-NEXT:    subl %ecx, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT:    shrl $2, %eax
+; AVX512F-NEXT:    movzbl %al, %ecx
+; AVX512F-NEXT:    shrl %eax
+; AVX512F-NEXT:    andl $85, %eax
+; AVX512F-NEXT:    subl %eax, %ecx
+; AVX512F-NEXT:    movl %ecx, %eax
 ; AVX512F-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT:    addl %ecx, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $4, %ecx
+; AVX512F-NEXT:    shrl $2, %ecx
+; AVX512F-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
 ; AVX512F-NEXT:    addl %eax, %ecx
-; AVX512F-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512F-NEXT:    movl %ecx, %eax
+; AVX512F-NEXT:    shrl $4, %eax
+; AVX512F-NEXT:    addl %ecx, %eax
+; AVX512F-NEXT:    andl $252645135, %eax ## imm = 0xF0F0F0F
+; AVX512F-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
 ; AVX512F-NEXT:    shrl $24, %eax
 ; AVX512F-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512F-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
@@ -571,21 +570,20 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLBW-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovb2m %xmm2, %k1
 ; AVX512VLBW-NEXT:    kmovd %k1, %eax
-; AVX512VLBW-NEXT:    movzbl %al, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    shrl %ecx
-; AVX512VLBW-NEXT:    andl $-43, %ecx
-; AVX512VLBW-NEXT:    subl %ecx, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT:    shrl $2, %eax
+; AVX512VLBW-NEXT:    movzbl %al, %ecx
+; AVX512VLBW-NEXT:    shrl %eax
+; AVX512VLBW-NEXT:    andl $85, %eax
+; AVX512VLBW-NEXT:    subl %eax, %ecx
+; AVX512VLBW-NEXT:    movl %ecx, %eax
 ; AVX512VLBW-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT:    addl %ecx, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    shrl $4, %ecx
+; AVX512VLBW-NEXT:    shrl $2, %ecx
+; AVX512VLBW-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
 ; AVX512VLBW-NEXT:    addl %eax, %ecx
-; AVX512VLBW-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512VLBW-NEXT:    movl %ecx, %eax
+; AVX512VLBW-NEXT:    shrl $4, %eax
+; AVX512VLBW-NEXT:    addl %ecx, %eax
+; AVX512VLBW-NEXT:    andl $252645135, %eax ## imm = 0xF0F0F0F
+; AVX512VLBW-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
 ; AVX512VLBW-NEXT:    shrl $24, %eax
 ; AVX512VLBW-NEXT:    kshiftrw $8, %k1, %k2
 ; AVX512VLBW-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}

diff  --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index 0d4074df399f5..563056544e692 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -57,15 +57,15 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; CHECK-NEXT:    cmovll %ecx, %edx
 ; CHECK-NEXT:    pextrw $1, %xmm0, %esi
-; CHECK-NEXT:    movswl %si, %edi
-; CHECK-NEXT:    leal (%rdi,%rdi), %eax
+; CHECK-NEXT:    leal (%rsi,%rsi), %edi
+; CHECK-NEXT:    movswl %si, %eax
 ; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    shrl $16, %esi
-; CHECK-NEXT:    shldw $1, %ax, %si
-; CHECK-NEXT:    sarl $15, %edi
-; CHECK-NEXT:    cmpl $16384, %edi # imm = 0x4000
+; CHECK-NEXT:    shldw $1, %di, %si
+; CHECK-NEXT:    sarl $16, %eax
+; CHECK-NEXT:    cmpl $16384, %eax # imm = 0x4000
 ; CHECK-NEXT:    cmovgel %r8d, %esi
-; CHECK-NEXT:    cmpl $-16384, %edi # imm = 0xC000
+; CHECK-NEXT:    cmpl $-16384, %eax # imm = 0xC000
 ; CHECK-NEXT:    cmovll %ecx, %esi
 ; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    cwtl
