[llvm] 4383079 - [AMDGPU] Remove dubious logic in bidirectional list scheduler

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 28 13:43:25 PST 2020


Author: Jay Foad
Date: 2020-02-28T21:35:34Z
New Revision: 43830790d76baf8a2488ae7c8179ec347ce43e8c

URL: https://github.com/llvm/llvm-project/commit/43830790d76baf8a2488ae7c8179ec347ce43e8c
DIFF: https://github.com/llvm/llvm-project/commit/43830790d76baf8a2488ae7c8179ec347ce43e8c.diff

LOG: [AMDGPU] Remove dubious logic in bidirectional list scheduler

Summary:
pickNodeBidirectional tried to compare the best top candidate and the
best bottom candidate by examining TopCand.Reason and BotCand.Reason.
This is unsound because, after calling pickNodeFromQueue, Cand.Reason
does not reflect the most important reason why Cand was chosen. Rather,
it reflects the most recent reason why Cand beat some other potential
candidate, which could have been a low-priority tie-breaker.
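
To illustrate, here is a minimal, self-contained sketch with stand-in
types (Candidate, CandReason and the stub tryCandidate below are
illustrative placeholders, not the LLVM definitions): the winner of a
pickNodeFromQueue-style loop only carries the Reason from its most
recent pairwise win.

  #include <cstddef>
  #include <vector>

  // Illustrative stand-ins only; not the LLVM SchedCandidate/CandReason types.
  enum CandReason { NoCand, RegExcess, RegCritical, NodeOrder };

  struct Candidate {
    int SU = 0;                 // stand-in for the SUnit this candidate holds
    CandReason Reason = NoCand; // reason of the most recent winning comparison
  };

  // Placeholder for tryCandidate: records a Reason on TryCand if it should
  // replace Cand. The real heuristic walks a prioritized list of criteria;
  // this stub only models that *some* reason gets recorded per comparison.
  static void tryCandidate(Candidate &Cand, Candidate &TryCand) {
    if (TryCand.SU < Cand.SU)
      TryCand.Reason = NodeOrder; // e.g. a low-priority node-order tie-break
  }

  // pickNodeFromQueue-style selection: the winner's Reason records only why
  // it beat the previous incumbent, possibly a low-priority tie-breaker, even
  // if the winner is excellent on register pressure. Comparing TopCand.Reason
  // with BotCand.Reason therefore says nothing about which pick is better.
  static Candidate pickFromQueue(const std::vector<int> &Queue) {
    Candidate Best;
    Best.SU = Queue.front(); // assumes a non-empty queue
    for (std::size_t I = 1; I < Queue.size(); ++I) {
      Candidate TryCand;
      TryCand.SU = Queue[I];
      tryCandidate(Best, TryCand);
      if (TryCand.Reason != NoCand)
        Best = TryCand;
    }
    return Best;
  }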

I have seen this cause problems where TopCand is a good candidate but,
because TopCand.Reason is ORDER (which is very low priority), it is
repeatedly ignored in favour of a mediocre BotCand. This is not how
bidirectional scheduling is supposed to work.

To fix this, I changed the code to always compare TopCand and BotCand
directly, as the generic implementation of pickNodeBidirectional does.
This removes some uncommented AMDGPU-specific logic; if that logic turns
out to be important, it could perhaps be moved into an override of
tryCandidate instead.
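
For reference, a sketch of the comparison this ends up doing, reusing
the stand-in types from the sketch above (the real code is in the
GCNSchedStrategy.cpp hunk below): seed Cand with BotCand and let a
single head-to-head tryCandidate call decide whether TopCand is
genuinely better, instead of comparing the two recorded Reasons.

  static Candidate pickBidirectional(Candidate TopCand, Candidate BotCand) {
    Candidate Cand = BotCand;
    TopCand.Reason = NoCand;
    tryCandidate(Cand, TopCand);   // one direct TopCand-vs-BotCand comparison
    if (TopCand.Reason != NoCand)
      Cand = TopCand;              // stands in for Cand.setBest(TopCand)
    return Cand;
  }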

Graphics shader benchmarking on gfx10 shows a lot more positive than
negative effects from this change.

Reviewers: arsenm, tstellar, rampitec, kzhuravl, vpykhtin, dstuttard, tpr, atrick, MatzeB

Subscribers: jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68338

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    llvm/test/CodeGen/AMDGPU/add3.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/bitreverse.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
    llvm/test/CodeGen/AMDGPU/fpow.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
    llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/test/CodeGen/AMDGPU/max.i16.ll
    llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
    llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
    llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    llvm/test/CodeGen/AMDGPU/sign_extend.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    llvm/test/CodeGen/AMDGPU/xor3.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 71ace87ed145..deed50b6db7d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -233,33 +233,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Pick best from BotCand and TopCand.
   LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
-  SchedCandidate Cand;
-  if (TopCand.Reason == BotCand.Reason) {
-    Cand = BotCand;
-    GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
-    TopCand.Reason = NoCand;
-    GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
-    if (TopCand.Reason != NoCand) {
-      Cand.setBest(TopCand);
-    } else {
-      TopCand.Reason = TopReason;
-    }
-  } else {
-    if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
-      Cand = TopCand;
-    } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
-      Cand = BotCand;
-    } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
-      Cand = TopCand;
-    } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
-      Cand = BotCand;
-    } else {
-      if (BotCand.Reason > TopCand.Reason) {
-        Cand = TopCand;
-      } else {
-        Cand = BotCand;
-      }
-    }
+  SchedCandidate Cand = BotCand;
+  TopCand.Reason = NoCand;
+  GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+  if (TopCand.Reason != NoCand) {
+    Cand.setBest(TopCand);
   }
   LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index b14b984ff9ac..f446f35b0e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -374,10 +374,10 @@ define i16 @v_bswap_i16(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i16:
@@ -440,10 +440,10 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -469,10 +469,10 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 499b38f03e96..ad8f1d17b8dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -126,21 +126,21 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-LABEL: v_pow_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX8-NEXT:    v_log_f16_e32 v2, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
-; GFX8-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16:
@@ -154,11 +154,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_exp_f16_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    v_exp_f16_e32 v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
@@ -173,40 +173,40 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_exp_f32_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX8-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX8-NEXT:    v_log_f16_e32 v2, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
-; GFX8-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
@@ -259,22 +259,22 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_log_f16_e32 v2, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f16_e32 v1, v2
+; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
@@ -336,22 +336,22 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_log_f16_e32 v2, v0
+; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_exp_f16_e32 v1, v2
+; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_exp_f16_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 5243076ea8b0..a0aa9a3bfe55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -233,34 +233,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %v
 ; MOVREL-NEXT:    s_mov_b32 s4, s6
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
 ; MOVREL-NEXT:    v_mov_b32_e32 v16, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v8, v0
 ; MOVREL-NEXT:    v_mov_b32_e32 v14, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s4
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s6
-; MOVREL-NEXT:    v_mov_b32_e32 v12, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v11, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v10, s1
 ; MOVREL-NEXT:    v_mov_b32_e32 v9, s0
 ; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
 ; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s1, v8
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v10
-; MOVREL-NEXT:    v_mov_b32_e32 v2, v11
-; MOVREL-NEXT:    v_mov_b32_e32 v3, v12
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v9
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v12
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v0
 ; MOVREL-NEXT:    s_mov_b32 m0, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v4, v13
-; MOVREL-NEXT:    v_mov_b32_e32 v5, v14
-; MOVREL-NEXT:    v_mov_b32_e32 v6, v15
-; MOVREL-NEXT:    v_mov_b32_e32 v7, v16
-; MOVREL-NEXT:    v_movreld_b32_e32 v0, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v13
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v16
+; MOVREL-NEXT:    v_movreld_b32_e32 v1, s10
 ; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
 ; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
 ; MOVREL-NEXT:    s_cbranch_execnz BB3_1
 ; MOVREL-NEXT:  ; %bb.2:
 ; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v5
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v8
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -393,35 +400,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %v
 ; MOVREL-NEXT:    s_mov_b32 s4, s6
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
 ; MOVREL-NEXT:    v_mov_b32_e32 v17, s7
-; MOVREL-NEXT:    v_mov_b32_e32 v8, v0
-; MOVREL-NEXT:    v_mov_b32_e32 v9, v1
 ; MOVREL-NEXT:    v_mov_b32_e32 v15, s5
-; MOVREL-NEXT:    v_mov_b32_e32 v16, s6
-; MOVREL-NEXT:    v_mov_b32_e32 v14, s4
 ; MOVREL-NEXT:    v_mov_b32_e32 v13, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s6
 ; MOVREL-NEXT:    v_mov_b32_e32 v12, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v11, s1
 ; MOVREL-NEXT:    v_mov_b32_e32 v10, s0
 ; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
 ; MOVREL-NEXT:    ; implicit-def: $vcc_hi
 ; MOVREL-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s1, v9
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v10
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v11
-; MOVREL-NEXT:    v_mov_b32_e32 v2, v12
-; MOVREL-NEXT:    v_mov_b32_e32 v3, v13
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v13
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; MOVREL-NEXT:    s_mov_b32 m0, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v4, v14
-; MOVREL-NEXT:    v_mov_b32_e32 v5, v15
-; MOVREL-NEXT:    v_mov_b32_e32 v6, v16
-; MOVREL-NEXT:    v_mov_b32_e32 v7, v17
-; MOVREL-NEXT:    v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v16
+; MOVREL-NEXT:    v_mov_b32_e32 v9, v17
+; MOVREL-NEXT:    v_movreld_b32_e32 v2, v0
 ; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
 ; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
 ; MOVREL-NEXT:    s_cbranch_execnz BB6_1
 ; MOVREL-NEXT:  ; %bb.2:
 ; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v5
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v8
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v9
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 3bbfed7781ab..02039a39e4b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -10,9 +10,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    s_add_u32 s2, 4, 4
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v2, v2
+; CHECK-NEXT:    ds_read_b32 v2, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index 3cad9f11bdba..527a66c48142 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -371,21 +371,21 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    flat_atomic_dec v4, v[2:3], v4 glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v4
+; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
@@ -393,21 +393,21 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_atomic_dec v4, v[2:3], v4 glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
@@ -448,7 +448,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
@@ -456,7 +455,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
@@ -464,7 +464,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -472,7 +471,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
@@ -678,21 +678,21 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    flat_atomic_dec v4, v[2:3], v4 glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v4
+; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
@@ -700,21 +700,21 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_atomic_dec v4, v[2:3], v4 glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    flat_atomic_dec v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
@@ -755,7 +755,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
@@ -763,7 +762,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
@@ -771,7 +771,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -779,7 +778,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
@@ -997,20 +997,20 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s3
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    v_mov_b32_e32 v5, 0
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1020,20 +1020,20 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1077,15 +1077,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; CI-NEXT:    s_endpgm
 ;
@@ -1094,15 +1094,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
@@ -1566,20 +1566,20 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s3
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    v_mov_b32_e32 v5, 0
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1589,20 +1589,20 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1646,15 +1646,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; CI-NEXT:    s_endpgm
 ;
@@ -1663,15 +1663,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_endpgm
 ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index f4b01506ad45..de462105dc48 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -373,21 +373,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    flat_atomic_inc v4, v[2:3], v4 glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v4
+; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
@@ -395,21 +395,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_atomic_inc v4, v[2:3], v4 glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
@@ -417,21 +417,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_atomic_inc v4, v[2:3], v4, off glc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
+; GFX9-NEXT:    global_atomic_inc v2, v[2:3], v4, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v4, off
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@@ -448,7 +448,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
@@ -456,7 +455,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -464,7 +464,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -472,7 +471,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -480,7 +480,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -488,7 +487,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 20, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v4, off glc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@@ -939,20 +939,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s3
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    v_mov_b32_e32 v5, 0
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -962,20 +962,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -985,20 +985,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 40, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    global_atomic_inc_x2 v[2:3], v[4:5], v[2:3], off glc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_endpgm
@@ -1017,15 +1017,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; CI-NEXT:    s_endpgm
 ;
@@ -1034,15 +1034,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_endpgm
 ;
@@ -1051,15 +1051,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1147,21 +1147,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    flat_atomic_inc v4, v[2:3], v4 glc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v4
+; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
@@ -1169,21 +1169,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_atomic_inc v4, v[2:3], v4 glc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
@@ -1191,21 +1191,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    flat_atomic_inc v4, v[2:3], v4 glc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
+; GFX9-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dword v[0:1], v4
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32* %ptr, i32 %id
@@ -1222,7 +1222,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT:    v_mov_b32_e32 v4, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
@@ -1230,7 +1229,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1238,7 +1238,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -1246,7 +1245,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1254,7 +1254,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -1262,7 +1261,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 20, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v4 glc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32* %ptr, i32 %id
@@ -1420,20 +1420,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s3
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT:    v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT:    v_mov_b32_e32 v4, 42
+; CI-NEXT:    v_mov_b32_e32 v5, 0
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
@@ -1443,20 +1443,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, 42
+; VI-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -1466,20 +1466,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 40, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -1498,15 +1498,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v5, s1
-; CI-NEXT:    v_mov_b32_e32 v4, s0
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; CI-NEXT:    s_endpgm
 ;
@@ -1515,15 +1515,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_endpgm
 ;
@@ -1532,15 +1532,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 32a25c6e8d0a..f3ed7e219ab8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
-; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x2c ; encoding: [0x00,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
+; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11]
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 4f9fb6d5d592..fdcf0f1515f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -136,19 +136,19 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s4
-; GFX906-NEXT:    v_dot2_i32_i16 v0, s5, v0, v1
+; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, s5
+; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_dot2_i32_i16 v0, s5, v0, v1
+; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT:    v_mov_b32_e32 v0, s5
+; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b:
@@ -168,19 +168,19 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s4
-; GFX906-NEXT:    v_dot2_i32_i16 v0, s5, v0, 8
+; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, s5
+; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_dot2_i32_i16 v0, s5, v0, 8
+; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT:    v_mov_b32_e32 v0, s5
+; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index a6a85bbb5c7e..d285ee132cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -136,19 +136,19 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s4
-; GFX906-NEXT:    v_dot2_u32_u16 v0, s5, v0, v1
+; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, s5
+; GFX906-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_dot2_u32_u16 v0, s5, v0, v1
+; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT:    v_mov_b32_e32 v0, s5
+; GFX908-NEXT:    v_dot2_u32_u16 v0, s4, v0, v1
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b:
@@ -168,19 +168,19 @@ define i32 @v_udot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT:    v_mov_b32_e32 v0, s4
-; GFX906-NEXT:    v_dot2_u32_u16 v0, s5, v0, 8
+; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT:    v_mov_b32_e32 v0, s5
+; GFX906-NEXT:    v_dot2_u32_u16 v0, s4, v0, 8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_dot2_u32_u16 v0, s5, v0, 8
+; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT:    v_mov_b32_e32 v0, s5
+; GFX908-NEXT:    v_dot2_u32_u16 v0, s4, v0, 8
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b_c:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 78590eac8280..751f3b703024 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -8,26 +8,26 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_nop 1
+; GFX8-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index b10af3b7a332..d753fd99e0ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -891,24 +891,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT:    s_add_u32 s0, s2, 0x3ffc
+; GFX6-NEXT:    s_add_u32 s4, s2, 0x3ffc
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_addc_u32 s5, s3, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    s_add_u32 s0, s2, 0x3ffc
+; GFX7-NEXT:    s_add_u32 s4, s2, 0x3ffc
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX7-NEXT:    s_addc_u32 s1, s3, 0
-; GFX7-NEXT:    s_mov_b32 s2, 0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_addc_u32 s5, s3, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
@@ -1067,32 +1067,34 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(i32 addrspace(1)* i
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, 2
+; GFX6-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
+; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v0, 2
+; GFX7-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset
   %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst

diff  --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
index bd430be7172f..a5da9706749d 100644
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -245,12 +245,12 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
 ;
 ; GFX10-LABEL: add3_uniform_vgpr:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_f32_e64 v0, s2, 1.0
 ; GFX10-NEXT:    v_add_f32_e64 v1, s3, 2.0
-; GFX10-NEXT:    v_add_f32_e64 v2, s2, 1.0
-; GFX10-NEXT:    v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT:    v_add_f32_e64 v2, 0x40400000, s4
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    ; return to shader part epilog
   %a1 = fadd float %a, 1.0
   %b2 = fadd float %b, 2.0

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 1b6a1a7427d6..8b1ec0b013a6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
 
@@ -290,11 +290,11 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x392fa
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    s_mov_b32 s5, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s5
+; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
+; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -320,11 +320,11 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xa410
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    s_mov_b32 s5, 0x30c30c31
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    v_mul_hi_i32 v0, v0, s5
+; GCN-NEXT:    s_mov_b32 s4, 0x30c30c31
+; GCN-NEXT:    v_mul_hi_i32 v0, v0, s4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -404,7 +404,6 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
-;
   %select = select i1 %cond, i32 5, i32 8
   %op = mul i32 %select, 1000
   ret i32 %op
@@ -415,7 +414,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
 ; IR-NEXT:    store i16 [[OP]], i16 addrspace(1)* undef
 ; IR-NEXT:    ret void
-
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
@@ -427,7 +425,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
-;
   %select = select i1 %cond, i16 5, i16 8
   %op = add i16 %select, 123
   store i16 %op, i16 addrspace(1)* undef
@@ -445,7 +442,6 @@ define i16 @select_add_trunc_select(i1 %cond) {
 ; IR-LABEL: @select_add_trunc_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
 ; IR-NEXT:    ret i16 [[OP]]
-;
   %select = select i1 %cond, i32 5, i32 8
   %trunc = trunc i32 %select to i16
   %op = add i16 %trunc, 42
@@ -473,7 +469,6 @@ define i32 @select_add_zext_select(i1 %cond) {
 ; IR-LABEL: @select_add_zext_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
 ; IR-NEXT:    ret i32 [[OP]]
-
 ; GCN-LABEL: select_add_zext_select:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 7d1bba66c18c..55796554b08e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -105,18 +105,18 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB0_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
+; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_add_rtn_u32 v1, v2, v1
@@ -125,7 +125,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB0_2:
 ; GFX1064-NEXT:    v_nop
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
@@ -275,17 +275,17 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB1_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
@@ -298,7 +298,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB1_2:
 ; GFX1064-NEXT:    v_nop
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
@@ -1662,18 +1662,18 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
 ;
 ; GFX1064-LABEL: sub_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[4:5], 1, 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s5, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB8_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s2, 5
+; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s4, 5
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v2, v1
@@ -1682,7 +1682,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB8_2:
 ; GFX1064-NEXT:    v_nop
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
@@ -1834,17 +1834,17 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ;
 ; GFX1064-LABEL: sub_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT:    v_cmp_ne_u32_e64 s[6:7], 1, 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB9_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s1, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mul_i32 s1, s0, s1
@@ -1857,7 +1857,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB9_2:
 ; GFX1064-NEXT:    v_nop
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 0c5c54efd856..2fb68bda542f 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -339,41 +339,41 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s0, 0xff00ff
-; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT:    s_mov_b32 s3, 0x33333333
-; SI-NEXT:    s_mov_b32 s6, 0xcccccccc
-; SI-NEXT:    s_mov_b32 s8, 0x55555555
-; SI-NEXT:    s_mov_b32 s9, 0xaaaaaaaa
+; SI-NEXT:    s_mov_b32 s6, 0xff00ff
+; SI-NEXT:    s_mov_b32 s8, 0xf0f0f0f
+; SI-NEXT:    s_mov_b32 s9, 0xf0f0f0f0
+; SI-NEXT:    s_mov_b32 s10, 0x33333333
+; SI-NEXT:    s_mov_b32 s11, 0xcccccccc
+; SI-NEXT:    s_mov_b32 s0, 0x55555555
+; SI-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT:    v_bfi_b32 v2, s0, v0, v2
-; SI-NEXT:    v_bfi_b32 v4, s0, v1, v3
-; SI-NEXT:    v_and_b32_e32 v1, s1, v2
-; SI-NEXT:    v_and_b32_e32 v0, s1, v4
-; SI-NEXT:    v_and_b32_e32 v3, s2, v2
-; SI-NEXT:    v_and_b32_e32 v2, s2, v4
+; SI-NEXT:    v_bfi_b32 v2, s6, v0, v2
+; SI-NEXT:    v_bfi_b32 v4, s6, v1, v3
+; SI-NEXT:    v_and_b32_e32 v1, s8, v2
+; SI-NEXT:    v_and_b32_e32 v0, s8, v4
+; SI-NEXT:    v_and_b32_e32 v3, s9, v2
+; SI-NEXT:    v_and_b32_e32 v2, s9, v4
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s3, v3
-; SI-NEXT:    v_and_b32_e32 v0, s3, v2
-; SI-NEXT:    v_and_b32_e32 v3, s6, v3
-; SI-NEXT:    v_and_b32_e32 v2, s6, v2
+; SI-NEXT:    v_and_b32_e32 v1, s10, v3
+; SI-NEXT:    v_and_b32_e32 v0, s10, v2
+; SI-NEXT:    v_and_b32_e32 v3, s11, v3
+; SI-NEXT:    v_and_b32_e32 v2, s11, v2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s8, v3
-; SI-NEXT:    v_and_b32_e32 v0, s8, v2
-; SI-NEXT:    v_and_b32_e32 v3, s9, v3
-; SI-NEXT:    v_and_b32_e32 v2, s9, v2
+; SI-NEXT:    v_and_b32_e32 v1, s0, v3
+; SI-NEXT:    v_and_b32_e32 v0, s0, v2
+; SI-NEXT:    v_and_b32_e32 v3, s1, v3
+; SI-NEXT:    v_and_b32_e32 v2, s1, v2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -386,45 +386,45 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
 ; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT:    s_mov_b32 s2, 0x10203
-; FLAT-NEXT:    s_mov_b32 s3, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s6, 0xf0f0f0f0
+; FLAT-NEXT:    s_mov_b32 s6, 0x10203
+; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
+; FLAT-NEXT:    s_mov_b32 s3, 0xcccccccc
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s0, 0x33333333
-; FLAT-NEXT:    s_mov_b32 s1, 0xcccccccc
-; FLAT-NEXT:    s_mov_b32 s8, 0x55555555
-; FLAT-NEXT:    s_mov_b32 s9, 0xaaaaaaaa
+; FLAT-NEXT:    s_mov_b32 s0, 0xf0f0f0f
+; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f0
 ; FLAT-NEXT:    s_mov_b32 s7, 0xf000
 ; FLAT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s2
-; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s2
-; FLAT-NEXT:    v_and_b32_e32 v1, s3, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s3, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s6, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s6, v4
+; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s6
+; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s6
+; FLAT-NEXT:    v_and_b32_e32 v1, s0, v2
+; FLAT-NEXT:    v_and_b32_e32 v0, s0, v4
+; FLAT-NEXT:    v_and_b32_e32 v3, s1, v2
+; FLAT-NEXT:    v_and_b32_e32 v2, s1, v4
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s0, 0x55555555
 ; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
+; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
+; FLAT-NEXT:    v_and_b32_e32 v3, s3, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s3, v2
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
+; FLAT-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
 ; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s8, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s8, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s9, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s9, v2
+; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
+; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
 ; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -600,13 +600,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s0, 0xff00ff
-; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT:    s_mov_b32 s3, 0x33333333
-; SI-NEXT:    s_mov_b32 s8, 0xcccccccc
-; SI-NEXT:    s_mov_b32 s9, 0x55555555
-; SI-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
+; SI-NEXT:    s_mov_b32 s8, 0xff00ff
+; SI-NEXT:    s_mov_b32 s9, 0xf0f0f0f
+; SI-NEXT:    s_mov_b32 s10, 0xf0f0f0f0
+; SI-NEXT:    s_mov_b32 s11, 0x33333333
+; SI-NEXT:    s_mov_b32 s12, 0xcccccccc
+; SI-NEXT:    s_mov_b32 s13, 0x55555555
+; SI-NEXT:    s_mov_b32 s14, 0xaaaaaaaa
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
@@ -617,18 +617,18 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT:    v_bfi_b32 v2, s0, v2, v4
-; SI-NEXT:    v_bfi_b32 v4, s0, v3, v5
-; SI-NEXT:    v_bfi_b32 v6, s0, v0, v6
-; SI-NEXT:    v_bfi_b32 v8, s0, v1, v7
-; SI-NEXT:    v_and_b32_e32 v1, s1, v2
-; SI-NEXT:    v_and_b32_e32 v0, s1, v4
-; SI-NEXT:    v_and_b32_e32 v3, s2, v2
-; SI-NEXT:    v_and_b32_e32 v2, s2, v4
-; SI-NEXT:    v_and_b32_e32 v5, s1, v6
-; SI-NEXT:    v_and_b32_e32 v4, s1, v8
-; SI-NEXT:    v_and_b32_e32 v7, s2, v6
-; SI-NEXT:    v_and_b32_e32 v6, s2, v8
+; SI-NEXT:    v_bfi_b32 v2, s8, v2, v4
+; SI-NEXT:    v_bfi_b32 v4, s8, v3, v5
+; SI-NEXT:    v_bfi_b32 v6, s8, v0, v6
+; SI-NEXT:    v_bfi_b32 v8, s8, v1, v7
+; SI-NEXT:    v_and_b32_e32 v1, s9, v2
+; SI-NEXT:    v_and_b32_e32 v0, s9, v4
+; SI-NEXT:    v_and_b32_e32 v3, s10, v2
+; SI-NEXT:    v_and_b32_e32 v2, s10, v4
+; SI-NEXT:    v_and_b32_e32 v5, s9, v6
+; SI-NEXT:    v_and_b32_e32 v4, s9, v8
+; SI-NEXT:    v_and_b32_e32 v7, s10, v6
+; SI-NEXT:    v_and_b32_e32 v6, s10, v8
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
 ; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
@@ -637,14 +637,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v4
-; SI-NEXT:    v_and_b32_e32 v1, s3, v3
-; SI-NEXT:    v_and_b32_e32 v0, s3, v2
-; SI-NEXT:    v_and_b32_e32 v5, s3, v7
-; SI-NEXT:    v_and_b32_e32 v4, s3, v6
-; SI-NEXT:    v_and_b32_e32 v3, s8, v3
-; SI-NEXT:    v_and_b32_e32 v2, s8, v2
-; SI-NEXT:    v_and_b32_e32 v7, s8, v7
-; SI-NEXT:    v_and_b32_e32 v6, s8, v6
+; SI-NEXT:    v_and_b32_e32 v1, s11, v3
+; SI-NEXT:    v_and_b32_e32 v0, s11, v2
+; SI-NEXT:    v_and_b32_e32 v5, s11, v7
+; SI-NEXT:    v_and_b32_e32 v4, s11, v6
+; SI-NEXT:    v_and_b32_e32 v3, s12, v3
+; SI-NEXT:    v_and_b32_e32 v2, s12, v2
+; SI-NEXT:    v_and_b32_e32 v7, s12, v7
+; SI-NEXT:    v_and_b32_e32 v6, s12, v6
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
 ; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
@@ -653,14 +653,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v4
-; SI-NEXT:    v_and_b32_e32 v1, s9, v3
-; SI-NEXT:    v_and_b32_e32 v0, s9, v2
-; SI-NEXT:    v_and_b32_e32 v5, s9, v7
-; SI-NEXT:    v_and_b32_e32 v4, s9, v6
-; SI-NEXT:    v_and_b32_e32 v3, s10, v3
-; SI-NEXT:    v_and_b32_e32 v2, s10, v2
-; SI-NEXT:    v_and_b32_e32 v7, s10, v7
-; SI-NEXT:    v_and_b32_e32 v6, s10, v6
+; SI-NEXT:    v_and_b32_e32 v1, s13, v3
+; SI-NEXT:    v_and_b32_e32 v0, s13, v2
+; SI-NEXT:    v_and_b32_e32 v5, s13, v7
+; SI-NEXT:    v_and_b32_e32 v4, s13, v6
+; SI-NEXT:    v_and_b32_e32 v3, s14, v3
+; SI-NEXT:    v_and_b32_e32 v2, s14, v2
+; SI-NEXT:    v_and_b32_e32 v7, s14, v7
+; SI-NEXT:    v_and_b32_e32 v6, s14, v6
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
 ; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
@@ -677,33 +677,33 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; FLAT-NEXT:    s_mov_b32 s2, 0x10203
-; FLAT-NEXT:    s_mov_b32 s3, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s8, 0xf0f0f0f0
+; FLAT-NEXT:    s_mov_b32 s10, 0x10203
+; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
+; FLAT-NEXT:    s_mov_b32 s3, 0xcccccccc
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s0, 0x33333333
-; FLAT-NEXT:    s_mov_b32 s1, 0xcccccccc
-; FLAT-NEXT:    s_mov_b32 s9, 0x55555555
-; FLAT-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
+; FLAT-NEXT:    s_mov_b32 s0, 0xf0f0f0f
+; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f0
+; FLAT-NEXT:    s_mov_b32 s8, 0x55555555
+; FLAT-NEXT:    s_mov_b32 s9, 0xaaaaaaaa
 ; FLAT-NEXT:    s_mov_b32 s7, 0xf000
 ; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v6, 0, v0, s2
-; FLAT-NEXT:    v_perm_b32 v4, 0, v3, s2
-; FLAT-NEXT:    v_perm_b32 v2, 0, v2, s2
-; FLAT-NEXT:    v_perm_b32 v8, 0, v1, s2
-; FLAT-NEXT:    v_and_b32_e32 v1, s3, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s3, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s8, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s8, v4
-; FLAT-NEXT:    v_and_b32_e32 v5, s3, v6
-; FLAT-NEXT:    v_and_b32_e32 v4, s3, v8
-; FLAT-NEXT:    v_and_b32_e32 v7, s8, v6
-; FLAT-NEXT:    v_and_b32_e32 v6, s8, v8
+; FLAT-NEXT:    v_perm_b32 v6, 0, v0, s10
+; FLAT-NEXT:    v_perm_b32 v4, 0, v3, s10
+; FLAT-NEXT:    v_perm_b32 v2, 0, v2, s10
+; FLAT-NEXT:    v_perm_b32 v8, 0, v1, s10
+; FLAT-NEXT:    v_and_b32_e32 v1, s0, v2
+; FLAT-NEXT:    v_and_b32_e32 v0, s0, v4
+; FLAT-NEXT:    v_and_b32_e32 v3, s1, v2
+; FLAT-NEXT:    v_and_b32_e32 v2, s1, v4
+; FLAT-NEXT:    v_and_b32_e32 v5, s0, v6
+; FLAT-NEXT:    v_and_b32_e32 v4, s0, v8
+; FLAT-NEXT:    v_and_b32_e32 v7, s1, v6
+; FLAT-NEXT:    v_and_b32_e32 v6, s1, v8
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
 ; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
@@ -712,14 +712,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
 ; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
 ; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT:    v_and_b32_e32 v5, s0, v7
-; FLAT-NEXT:    v_and_b32_e32 v4, s0, v6
-; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
-; FLAT-NEXT:    v_and_b32_e32 v7, s1, v7
-; FLAT-NEXT:    v_and_b32_e32 v6, s1, v6
+; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
+; FLAT-NEXT:    v_and_b32_e32 v5, s2, v7
+; FLAT-NEXT:    v_and_b32_e32 v4, s2, v6
+; FLAT-NEXT:    v_and_b32_e32 v3, s3, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s3, v2
+; FLAT-NEXT:    v_and_b32_e32 v7, s3, v7
+; FLAT-NEXT:    v_and_b32_e32 v6, s3, v6
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
 ; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
@@ -728,14 +728,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
 ; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
 ; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT:    v_and_b32_e32 v1, s9, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s9, v2
-; FLAT-NEXT:    v_and_b32_e32 v5, s9, v7
-; FLAT-NEXT:    v_and_b32_e32 v4, s9, v6
-; FLAT-NEXT:    v_and_b32_e32 v3, s10, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s10, v2
-; FLAT-NEXT:    v_and_b32_e32 v7, s10, v7
-; FLAT-NEXT:    v_and_b32_e32 v6, s10, v6
+; FLAT-NEXT:    v_and_b32_e32 v1, s8, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s8, v2
+; FLAT-NEXT:    v_and_b32_e32 v5, s8, v7
+; FLAT-NEXT:    v_and_b32_e32 v4, s8, v6
+; FLAT-NEXT:    v_and_b32_e32 v3, s9, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s9, v2
+; FLAT-NEXT:    v_and_b32_e32 v7, s9, v7
+; FLAT-NEXT:    v_and_b32_e32 v6, s9, v6
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index f6672548a94c..9ac1570fab7c 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -661,11 +661,7 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
@@ -674,10 +670,14 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v3, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, v3, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v6, -1, v3
@@ -734,33 +734,33 @@ define i32 @srem32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v6, 0, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v5, 0, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, v2, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
-; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; GFX9-NEXT:    v_sub_u32_e32 v0, v3, v1
 ; GFX9-NEXT:    s_and_b64 vcc, vcc, s[4:5]
-; GFX9-NEXT:    v_add_u32_e32 v5, v4, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_add_u32_e32 v5, v3, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i32 %a, %b
   ret i32 %d

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 7a528ef70ced..29ab79ddf201 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -8,37 +8,37 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
 ; SI-LABEL: test_copy_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -72,16 +72,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-LABEL: test_copy_v4i8_x2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_mov_b32 s0, s6
 ; VI-NEXT:    s_mov_b32 s1, s7
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
@@ -102,25 +102,25 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s14, 0
-; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_mov_b32 s18, 0
+; SI-NEXT:    s_mov_b32 s19, s11
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_mov_b32 s9, s5
-; SI-NEXT:    s_mov_b32 s4, s2
-; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    s_mov_b32 s2, s10
 ; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -160,61 +160,60 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
 define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x4:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x11
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s15
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s14, 0
+; SI-NEXT:    s_mov_b32 s15, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s14, -1
-; SI-NEXT:    s_mov_b32 s18, s14
-; SI-NEXT:    s_mov_b32 s19, s15
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s16, s2
-; SI-NEXT:    s_mov_b32 s17, s3
-; SI-NEXT:    s_mov_b32 s2, s14
-; SI-NEXT:    s_mov_b32 s3, s15
-; SI-NEXT:    s_mov_b32 s12, s6
-; SI-NEXT:    s_mov_b32 s13, s7
-; SI-NEXT:    s_mov_b32 s6, s14
-; SI-NEXT:    s_mov_b32 s7, s15
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s16, s6
+; SI-NEXT:    s_mov_b32 s17, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_mov_b32 s0, s10
+; SI-NEXT:    s_mov_b32 s1, s11
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_mov_b32 s18, s2
+; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v4i8_x4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s11, 0xf000
-; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
+; VI-NEXT:    s_mov_b32 s0, s10
+; VI-NEXT:    v_mov_b32_e32 v1, s13
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s12, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s12, s2
-; VI-NEXT:    s_mov_b32 s13, s3
-; VI-NEXT:    s_mov_b32 s2, s10
-; VI-NEXT:    s_mov_b32 s3, s11
-; VI-NEXT:    s_mov_b32 s8, s6
-; VI-NEXT:    s_mov_b32 s9, s7
-; VI-NEXT:    s_mov_b32 s6, s10
-; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    s_mov_b32 s14, s10
-; VI-NEXT:    s_mov_b32 s15, s11
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s1, s11
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s14, s2
+; VI-NEXT:    s_mov_b32 s15, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -230,76 +229,75 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; SI-LABEL: test_copy_v4i8_extra_use:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s0, 0xff00
-; SI-NEXT:    s_mov_b32 s8, s6
-; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_movk_i32 s1, 0xff
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    s_mov_b32 s12, 0xff00
+; SI-NEXT:    s_movk_i32 s13, 0xff
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s6
+; SI-NEXT:    s_mov_b32 s1, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    v_and_b32_e32 v2, s0, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
-; SI-NEXT:    v_and_b32_e32 v0, s1, v0
-; SI-NEXT:    v_and_b32_e32 v3, s0, v1
+; SI-NEXT:    v_and_b32_e32 v4, s12, v1
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s1, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, s12, v0
+; SI-NEXT:    v_and_b32_e32 v3, s13, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, s13, v1
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
+; SI-NEXT:    v_or_b32_e32 v1, v4, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v4i8_extra_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_movk_i32 s8, 0xff00
+; VI-NEXT:    s_movk_i32 s10, 0x900
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_movk_i32 s8, 0xff00
+; VI-NEXT:    s_movk_i32 s9, 0xff
 ; VI-NEXT:    s_mov_b32 s0, s6
 ; VI-NEXT:    s_mov_b32 s1, s7
-; VI-NEXT:    s_movk_i32 s9, 0xff
 ; VI-NEXT:    s_mov_b32 s6, s2
 ; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    s_movk_i32 s10, 0x900
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT:    v_and_b32_e32 v3, s8, v1
+; VI-NEXT:    v_and_b32_e32 v4, s8, v1
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
+; VI-NEXT:    v_add_u16_e32 v3, 9, v0
 ; VI-NEXT:    v_and_b32_e32 v1, s9, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-NEXT:    v_and_b32_e32 v2, s8, v0
-; VI-NEXT:    v_add_u16_e32 v0, 9, v0
-; VI-NEXT:    v_and_b32_e32 v0, s9, v0
-; VI-NEXT:    v_or_b32_e32 v1, v3, v1
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_and_b32_e32 v3, s9, v3
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    v_add_u16_e32 v1, s10, v1
-; VI-NEXT:    v_add_u16_e32 v0, s10, v0
+; VI-NEXT:    v_add_u16_e32 v2, s10, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -316,33 +314,33 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s14, 0
-; SI-NEXT:    s_mov_b32 s15, s3
+; SI-NEXT:    s_mov_b32 s18, 0
+; SI-NEXT:    s_mov_b32 s19, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[10:11]
+; SI-NEXT:    s_mov_b64 s[16:17], s[10:11]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; SI-NEXT:    s_mov_b32 s16, 0xff00
-; SI-NEXT:    s_movk_i32 s17, 0xff
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
 ; SI-NEXT:    s_mov_b32 s0, s8
 ; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s6
-; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_mov_b32 s8, 0xff00
+; SI-NEXT:    s_movk_i32 s9, 0xff
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s2
+; SI-NEXT:    s_mov_b32 s15, s3
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    v_and_b32_e32 v4, s16, v1
+; SI-NEXT:    v_and_b32_e32 v4, s8, v1
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT:    v_and_b32_e32 v2, s16, v0
-; SI-NEXT:    v_and_b32_e32 v3, s17, v3
+; SI-NEXT:    v_and_b32_e32 v2, s8, v0
+; SI-NEXT:    v_and_b32_e32 v3, s9, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v1, s17, v1
+; SI-NEXT:    v_and_b32_e32 v1, s9, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -350,7 +348,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -358,39 +356,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_movk_i32 s12, 0xff00
-; VI-NEXT:    s_movk_i32 s13, 0xff
-; VI-NEXT:    s_movk_i32 s14, 0x900
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s14, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_mov_b32 s2, s6
-; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    s_mov_b32 s8, s4
+; VI-NEXT:    s_movk_i32 s4, 0xff00
+; VI-NEXT:    s_mov_b32 s9, s5
+; VI-NEXT:    s_movk_i32 s5, 0xff
+; VI-NEXT:    s_movk_i32 s6, 0x900
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s15, s11
+; VI-NEXT:    s_mov_b32 s2, s10
+; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT:    v_and_b32_e32 v4, s12, v1
+; VI-NEXT:    v_and_b32_e32 v4, s4, v1
 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
-; VI-NEXT:    v_and_b32_e32 v1, s13, v1
+; VI-NEXT:    v_and_b32_e32 v1, s5, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v4, v1
-; VI-NEXT:    v_and_b32_e32 v2, s12, v0
-; VI-NEXT:    v_and_b32_e32 v3, s13, v3
+; VI-NEXT:    v_and_b32_e32 v2, s4, v0
+; VI-NEXT:    v_and_b32_e32 v3, s5, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_add_u16_e32 v1, s14, v1
-; VI-NEXT:    v_add_u16_e32 v2, s14, v2
+; VI-NEXT:    v_add_u16_e32 v1, s6, v1
+; VI-NEXT:    v_add_u16_e32 v2, s6, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -405,41 +405,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
 ; SI-LABEL: test_copy_v3i8_align4:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v3i8_align4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index ce18967276d6..dc8d630bc7d4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -65,21 +65,22 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i32:
@@ -131,16 +132,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
 define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_v2i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v2, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v3, v0
@@ -148,7 +149,8 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_v2i32:
@@ -206,16 +208,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
 define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_v4i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v4, v3
 ; SI-NEXT:    v_ffbh_u32_e32 v5, v2
@@ -229,7 +231,8 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_v4i32:
@@ -299,9 +302,9 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
 define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i8:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -501,7 +504,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: v_ctlz_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -509,7 +511,7 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v4, v2
 ; SI-NEXT:    v_ffbh_u32_e32 v5, v3
@@ -520,7 +522,8 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 64, v3, vcc
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i64:
@@ -588,7 +591,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
 define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: v_ctlz_i64_trunc:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -596,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v3
 ; SI-NEXT:    v_ffbh_u32_e32 v5, v4
@@ -607,7 +609,8 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; SI-NEXT:    buffer_store_dword v0, v[1:2], s[8:11], 0 addr64
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i64_trunc:
@@ -615,26 +618,26 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, v5, v3, vcc
-; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[1:2]
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v4, v0
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
 ; VI-NEXT:    v_ffbh_u32_e32 v5, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 64, v1, vcc
-; VI-NEXT:    flat_store_dword v[2:3], v0
+; VI-NEXT:    flat_store_dword v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_i64_trunc:
@@ -676,19 +679,20 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -742,19 +746,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out
 define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -809,23 +814,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
 define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -885,23 +891,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
 define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -961,18 +968,19 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
  define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1030,9 +1038,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
  define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1102,19 +1110,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
 define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
 ; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_ctlz_i7_sel_eq_neg1:

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 3ad8e431ad98..2ea7dfdd1354 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -261,19 +261,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s11
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
 ; SI-NEXT:    s_movk_i32 s12, 0xff
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
@@ -284,7 +284,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, s12, v4
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
@@ -297,44 +297,44 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT:    v_mov_b32_e32 v4, 9
-; VI-NEXT:    s_movk_i32 s8, 0x900
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
-; VI-NEXT:    v_mov_b32_e32 v6, s8
+; VI-NEXT:    v_mov_b32_e32 v4, 9
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    s_movk_i32 s0, 0x900
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 24, v5
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v5
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v5
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v5
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    v_and_b32_e32 v8, 0xffffff00, v5
-; VI-NEXT:    v_add_u16_e32 v9, 9, v5
+; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v5
+; VI-NEXT:    v_add_u16_e32 v8, 9, v5
 ; VI-NEXT:    v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
+; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, s8, v0
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_add_u16_e32 v0, s0, v0
+; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@@ -358,31 +358,30 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
-; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v9, v3
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v0
-; SI-NEXT:    v_or_b32_e32 v0, v8, v6
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
+; SI-NEXT:    v_or_b32_e32 v0, v8, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v6, v0, v2
+; SI-NEXT:    v_or_b32_e32 v6, v0, v6
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v2
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
-; SI-NEXT:    buffer_store_dword v7, off, s[4:7], 0 offset:24
+; SI-NEXT:    buffer_store_dword v9, off, s[4:7], 0 offset:24
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -402,33 +401,35 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v10, v[2:3]
-; VI-NEXT:    flat_load_ubyte v11, v[4:5]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 6, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 5, v0
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 6, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v5, v[8:9]
+; VI-NEXT:    v_add_u32_e32 v10, vcc, 5, v0
+; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v0
+; VI-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ubyte v6, v[6:7]
+; VI-NEXT:    flat_load_ubyte v7, v[8:9]
+; VI-NEXT:    flat_load_ubyte v8, v[10:11]
+; VI-NEXT:    flat_load_ubyte v9, v[12:13]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v1, v[2:3]
+; VI-NEXT:    flat_load_ubyte v2, v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v10
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v6
+; VI-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; VI-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v8
+; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v4
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v9
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -880,42 +881,42 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
 define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
 ; SI-LABEL: cvt_ubyte0_or_multiuse:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s2
-; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_mov_b32 s0, s6
+; SI-NEXT:    s_mov_b32 s1, s7
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: cvt_ubyte0_or_multiuse:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 0053d2f3019d..6cdd03b75491 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -9,11 +9,10 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -21,11 +20,10 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
 ; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], s33 offen
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -91,21 +89,21 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 3, v0
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
+; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], s33 offen
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 117307968502..894d6e6d1b31 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -7,23 +7,23 @@
 define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmax_legacy_uge_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmax_legacy_uge_f64:
@@ -59,23 +59,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmax_legacy_oge_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmax_legacy_oge_f64:
@@ -111,23 +111,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmax_legacy_ugt_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmax_legacy_ugt_f64:
@@ -163,23 +163,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmax_legacy_ogt_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmax_legacy_ogt_f64:

diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 306b4c25fb92..108d914add95 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -5,23 +5,23 @@
 define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_uge_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_uge_f64:
@@ -57,23 +57,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ugt_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_ugt_f64:
@@ -109,23 +109,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ule_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_ule_f64:
@@ -161,23 +161,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ult_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_ult_f64:
@@ -213,23 +213,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_oge_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_oge_f64:
@@ -265,23 +265,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ogt_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_ogt_f64:
@@ -317,23 +317,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ole_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_ole_f64:
@@ -369,23 +369,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, d
 define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 ; SI-LABEL: test_fmin_legacy_olt_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s11, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_fmin_legacy_olt_f64:

diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index 860ede2713b8..bb32ecdb9f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -110,18 +110,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
+; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -137,30 +137,30 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
   ret <2 x half> %pow
@@ -172,21 +172,21 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_log_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_log_f32_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_log_f32_e32 v4, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v1, v2
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v2, v3
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v4
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -201,30 +201,30 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -266,30 +266,30 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, -v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -336,30 +336,30 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, -v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y

diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 15f9fb2a6bfb..d9699fe4ce66 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -24,25 +24,25 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:  BB0_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s6
-; GFX9-NEXT:    v_add_u32_e32 v3, v2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, s2
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v6, -1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, s6, v1
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s6, v4
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s2
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT:    v_add_u32_e32 v7, -1, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, s6, v4
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s6, v5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v4
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
 ; GFX9-NEXT:    s_add_u32 s6, s6, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    s_add_u32 s4, s4, 4
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x400
 ; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB0_1
@@ -88,29 +88,29 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:  BB1_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s6
-; GFX9-NEXT:    v_add_u32_e32 v3, v2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s2
-; GFX9-NEXT:    v_sub_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_not_b32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, s2
+; GFX9-NEXT:    v_not_b32_e32 v6, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, s6, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], s6, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v6
+; GFX9-NEXT:    v_add_u32_e32 v5, s6, v5
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
 ; GFX9-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v4, s6, v6
 ; GFX9-NEXT:    v_add_u32_e32 v3, s6, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, s6, v5
 ; GFX9-NEXT:    s_add_u32 s6, s6, 1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_addc_u32 s7, s7, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    s_add_u32 s4, s4, 4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x400
 ; GFX9-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX9-NEXT:    s_cbranch_scc0 BB1_1
@@ -162,15 +162,15 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v3, s3
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_add_u32_e32 v7, -1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
+; GFX9-NEXT:    v_add_u32_e32 v6, -1, v3
+; GFX9-NEXT:    v_sub_u32_e32 v7, s6, v4
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s6, v4
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v7
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    s_add_i32 s6, s6, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GFX9-NEXT:    s_add_u32 s4, s4, 4
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
@@ -222,10 +222,10 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:  BB3_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v3
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], s3, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
@@ -275,19 +275,19 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:  BB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_and_b32_e32 v2, s2, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v2, v7, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v2
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_mad_f32 v2, -v2, v0, v7
+; GFX9-NEXT:    v_mad_f32 v2, -v2, v0, v8
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, v0
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
 ; GFX9-NEXT:    global_store_short v[5:6], v2, off
 ; GFX9-NEXT:    s_cbranch_vccz BB4_1
@@ -326,15 +326,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-NEXT:  BB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_and_b32_e32 v2, s2, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v8, v7, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v8, v8
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v8
-; GFX9-NEXT:    v_mad_f32 v7, -v8, v0, v7
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v7
+; GFX9-NEXT:    v_mad_f32 v7, -v7, v0, v8
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, v0
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]

diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b917e86821a5..2844898bf2f3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -102,18 +102,18 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot2:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
@@ -246,19 +246,19 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    s_and_b32 s6, s2, s4
+; GFX10-DL-NEXT:    s_and_b32 s4, s3, s4
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s2, s6
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s3, v0
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v0, s4, s6
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -377,18 +377,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot2:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
@@ -648,18 +648,18 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot2_alt_AddOperands:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                                  <2 x i16> addrspace(1)* %src2,
@@ -854,14 +854,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -873,14 +873,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -892,14 +892,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX9-DL-NEXT:    v_mad_u32_u24 v0, s2, s2, v0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, s4, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1039,18 +1039,18 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot2_v4i16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i16> addrspace(1)* %src2,
@@ -1171,18 +1171,18 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot2_v4i16_Hi:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x4
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x4
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           <4 x i16> addrspace(1)* %src2,
@@ -1312,18 +1312,18 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s3, s3, s8
-; GFX10-DL-NEXT:    s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s7
+; GFX10-DL-NEXT:    s_and_b32 s5, s5, s7
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT:    s_and_b32 s2, s2, s8
-; GFX10-DL-NEXT:    s_and_b32 s4, s4, s8
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, s7
+; GFX10-DL-NEXT:    s_and_b32 s4, s4, s7
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s3, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s2, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s7, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s3, s3, s8
-; GFX10-DL-NEXT:    s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s7
+; GFX10-DL-NEXT:    s_and_b32 s5, s5, s7
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
@@ -1602,18 +1602,18 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT:    s_and_b32 s7, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_and_b32 s2, s3, s2
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX10-DL-NEXT:    s_and_b32 s7, s3, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1751,20 +1751,20 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, s5
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s5
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s3, s2, v0
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -2049,21 +2049,21 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-DL-NEXT:    s_and_b32 s6, s2, s5
+; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s6, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s3, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s6, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s3, s2, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -2349,21 +2349,21 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, s4
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s4
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -2621,15 +2621,15 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2705,22 +2705,22 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v1, v[2:3]
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_i32 v1, v2, 0, 8
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfe_i32 v3, v0, 0, 8
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v1, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -2732,23 +2732,23 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NODL-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX9-NODL-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NODL-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT:    v_bfe_i32 v1, v2, 0, 8
-; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v3, v0, 0, 8
 ; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX9-NODL-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v0, v0, v1, s2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NODL-NEXT:    global_store_dword v[0:1], v2, off
@@ -2760,23 +2760,23 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-DL-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX9-DL-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-DL-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_i32 v1, v2, 0, 8
-; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
 ; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX9-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v0, v2, s2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-DL-NEXT:    v_mad_i32_i24 v0, v0, v1, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off

diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 43ca2a400390..2a76184352a4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -108,29 +108,29 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s3, s4, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s0, s1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
@@ -226,21 +226,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i8 s1, s2
-; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x80008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX8-NEXT:    s_sext_i32_i8 s3, s2
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x80010
 ; GFX8-NEXT:    s_sext_i32_i8 s1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80008
-; GFX8-NEXT:    s_bfe_i32 s3, s0, 0x80010
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v5, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
@@ -257,21 +257,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s2
-; GFX9-NODL-NEXT:    s_bfe_i32 s3, s2, 0x80008
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s2
+; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT:    s_bfe_i32 s7, s2, 0x80010
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NODL-NEXT:    s_bfe_i32 s4, s0, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_i32 s3, s0, 0x80010
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT:    s_bfe_i32 s6, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9-NODL-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s3, v5, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s6, v5, v2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
@@ -282,15 +282,15 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -451,15 +451,15 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -928,30 +928,30 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sext_i32_i8 s1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x80010
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 8, s0
-; GFX8-NEXT:    s_sext_i32_i8 s1, s0
-; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
+; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x80010
+; GFX8-NEXT:    s_ashr_i32 s4, s1, 24
+; GFX8-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NEXT:    s_ashr_i32 s2, s0, 24
+; GFX8-NEXT:    s_bfe_i32 s3, s0, 0x80010
+; GFX8-NEXT:    s_sext_i32_i8 s0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80010
-; GFX8-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX8-NEXT:    v_mov_b32_e32 v6, s3
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, v5, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v5, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v4, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s3, v6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1037,16 +1037,17 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot4_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s0, 0x80000
 ; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x80000
@@ -1054,25 +1055,25 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 8, s0
 ; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 8, s1
-; GFX10-DL-NEXT:    v_and_b32_e32 v6, s3, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v7, s4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, s3, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, s4, v3
 ; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x80000
 ; GFX10-DL-NEXT:    s_bfe_i32 s1, s5, 0x80000
-; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 8, s2
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 8, s2
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, s1, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 8, s5
-; GFX10-DL-NEXT:    v_and_b32_e32 v8, s1, v2
-; GFX10-DL-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v7, 16, v8
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v4, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,

diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 0ca56d31196c..06b758fa285d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -111,29 +111,29 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s3, s4, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
@@ -278,15 +278,15 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -448,15 +448,15 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -608,24 +608,25 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot2_8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s3, s0, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s1, s2
+; GFX10-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_movk_i32 s1, 0xff
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s3, s2, s1
+; GFX10-DL-NEXT:    s_and_b32 s1, s0, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s2, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s2, v2
 ; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                    <4 x i8> addrspace(1)* %src2,
@@ -752,14 +753,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s3, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -943,16 +944,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ;
 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x80008
@@ -1138,26 +1140,26 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_movk_i32 s5, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s2, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s7, v0
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s2, v0
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s5, s7, v0
+; GFX10-DL-NEXT:    s_and_b32 s6, s2, s5
+; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s7, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s7, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1345,25 +1347,25 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_movk_i32 s5, 0xff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80008
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80008
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT:    s_and_b32 s8, s3, s2
-; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s2, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    s_and_b32 s8, s2, s5
+; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s6, s7, v0
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80010
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
-; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s2, v0
-; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 24
-; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 24
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s5, v0
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s8, s5, v0
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, s4, v0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s6, s7, v1
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v1, s2, s3, v1
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
@@ -1458,21 +1460,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX8-NEXT:    s_sext_i32_i8 s3, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80010
-; GFX8-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    s_sext_i32_i8 s4, s0
-; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
@@ -1489,21 +1491,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s4, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s3, s0, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
@@ -1520,21 +1522,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80010
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX9-DL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    s_sext_i32_i8 s4, s0
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x80010
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x80010
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
@@ -1542,15 +1544,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: notdot4_mixedtypes:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; GFX10-DL-NEXT:    s_bfe_u32 s3, s1, 0x80008
@@ -1742,29 +1745,29 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL:       ; %bb.0: ; %entry
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s3, 0xff
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
-; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_movk_i32 s5, 0xff
+; GFX10-DL-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT:    s_and_b32 s7, s4, s3
-; GFX10-DL-NEXT:    s_and_b32 s3, s5, s3
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT:    s_bfe_u32 s2, s4, 0x80010
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s3, v2
-; GFX10-DL-NEXT:    s_bfe_u32 s3, s5, 0x80010
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_and_b32 s7, s2, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s2, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s3, 0x80010
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s2, s3, v0
-; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s4, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v0, s4, s5, v0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
@@ -1933,38 +1936,39 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot4_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v5, 8, s1
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
 ; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
 ; GFX10-DL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
 ; GFX10-DL-NEXT:    v_lshl_or_b32 v5, s1, 16, v6
-; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s0, 16, v2
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v3, s0, 16, v3
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v4, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
@@ -2084,34 +2088,34 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v0, s2, v0
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-NODL-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NODL-NEXT:    s_lshr_b32 s5, s2, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v1, s4, v1
+; GFX9-NODL-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT:    v_or_b32_e32 v3, v2, v0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s2, 16
-; GFX9-NODL-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v3, s0, v3
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NODL-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v5, s2, v5
-; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX9-NODL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GFX9-NODL-NEXT:    global_load_ubyte v5, v[0:1], off
+; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NODL-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
@@ -2120,48 +2124,49 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v0, s2, v0
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 24
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v1, s4, v1
+; GFX9-DL-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v3, v2, v0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s2, 16
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s0, v3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 24
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s2, v5
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX9-DL-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GFX9-DL-NEXT:    global_load_ubyte v5, v[0:1], off
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
 ; GFX10-DL-NEXT:    v_lshrrev_b16_e64 v4, 8, s1

diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 13c243fcdd91..4ec5d77b64f7 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -156,29 +156,29 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s1, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -321,48 +321,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x40000
-; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x40000
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 12
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s6, s1, 0x40000
+; GFX8-NEXT:    s_bfe_i32 s8, s1, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s10, s1, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 12
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 12
+; GFX8-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX8-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX8-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX8-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x40010
-; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s1
-; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40010
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NEXT:    s_bfe_i32 s1, s0, 0x40014
-; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40018
-; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX8-NEXT:    v_mov_b32_e32 v10, s7
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mov_b32_e32 v7, s8
+; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
+; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX8-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
+; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT:    s_bfe_i32 s14, s1, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    s_bfe_i32 s16, s1, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX8-NEXT:    v_mov_b32_e32 v9, s14
+; GFX8-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX8-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
-; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -372,48 +372,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x40000
-; GFX9-NEXT:    s_bfe_i32 s4, s2, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x40000
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX9-NEXT:    s_lshr_b32 s7, s2, 12
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s6, s1, 0x40000
+; GFX9-NEXT:    s_bfe_i32 s8, s1, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s10, s1, 0x40008
+; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 12
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 12
+; GFX9-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX9-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
-; GFX9-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x40010
-; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s1
-; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40010
-; GFX9-NEXT:    s_bfe_i32 s7, s2, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s5
-; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x40014
-; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
+; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT:    s_bfe_i32 s14, s1, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    s_bfe_i32 s16, s1, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX9-NEXT:    v_mov_b32_e32 v9, s14
+; GFX9-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
-; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -423,87 +423,88 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s2, 0x40000
-; GFX9-DL-NEXT:    s_bfe_i32 s4, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX9-DL-NEXT:    s_lshr_b32 s7, s2, 12
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s1, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 12
+; GFX9-DL-NEXT:    s_lshr_b32 s4, s1, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX9-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s1
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s2, 0x40010
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s1
-; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_i32 s7, s2, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s5
-; GFX9-DL-NEXT:    s_bfe_i32 s1, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s7
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s2
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s1, 0x40010
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
+; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s11, s0, 0x40010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s13, s0, 0x40014
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s14
+; GFX9-DL-NEXT:    s_bfe_i32 s15, s0, 0x40018
+; GFX9-DL-NEXT:    s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s6, v3, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s8, v5, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v8, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v9, v2
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s1, 0x40008
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s8, s9
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s10, s11
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
@@ -814,39 +815,40 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_lshr_b32 s4, s0, 12
-; GFX10-DL-NEXT:    s_lshr_b32 s5, s1, 12
-; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
-; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s4
-; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s5
-; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40004
-; GFX10-DL-NEXT:    s_bfe_i32 s10, s0, 0x40008
-; GFX10-DL-NEXT:    s_bfe_i32 s11, s1, 0x40008
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s1, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s2
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s4
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s8, s9
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX10-DL-NEXT:    s_bfe_i32 s4, s1, 0x40010
-; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v5, s10, s11
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s0, 0x40014
 ; GFX10-DL-NEXT:    s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
@@ -1643,54 +1645,54 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 28
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s9, s7, 24
-; GFX8-NEXT:    s_lshl_b32 s11, s7, 20
-; GFX8-NEXT:    s_lshl_b32 s5, s1, 28
-; GFX8-NEXT:    s_ashr_i64 s[14:15], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s1, 20
-; GFX8-NEXT:    s_lshl_b32 s13, s1, 24
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    s_lshl_b32 s29, s7, 28
+; GFX8-NEXT:    s_ashr_i64 s[18:19], s[6:7], 60
+; GFX8-NEXT:    s_lshl_b32 s21, s7, 8
+; GFX8-NEXT:    s_lshl_b32 s23, s7, 12
+; GFX8-NEXT:    s_lshl_b32 s17, s1, 28
+; GFX8-NEXT:    s_lshl_b32 s25, s7, 16
+; GFX8-NEXT:    s_lshl_b32 s27, s7, 24
+; GFX8-NEXT:    s_lshl_b32 s19, s7, 4
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 20
+; GFX8-NEXT:    s_ashr_i64 s[4:5], s[0:1], 60
+; GFX8-NEXT:    s_ashr_i64 s[28:29], s[28:29], 60
+; GFX8-NEXT:    s_lshl_b32 s9, s1, 8
+; GFX8-NEXT:    s_lshl_b32 s11, s1, 12
+; GFX8-NEXT:    s_lshl_b32 s13, s1, 16
+; GFX8-NEXT:    s_lshl_b32 s15, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s5, s1, 4
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 20
+; GFX8-NEXT:    s_ashr_i64 s[26:27], s[26:27], 60
+; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
+; GFX8-NEXT:    s_ashr_i64 s[16:17], s[16:17], 60
+; GFX8-NEXT:    v_mov_b32_e32 v4, s28
+; GFX8-NEXT:    s_ashr_i64 s[14:15], s[14:15], 60
+; GFX8-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s26
+; GFX8-NEXT:    s_ashr_i64 s[24:25], s[24:25], 60
+; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s0, v3
+; GFX8-NEXT:    s_ashr_i64 s[22:23], s[22:23], 60
 ; GFX8-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 16
-; GFX8-NEXT:    v_mov_b32_e32 v5, s8
-; GFX8-NEXT:    s_lshl_b32 s9, s1, 16
-; GFX8-NEXT:    s_lshl_b32 s11, s7, 12
-; GFX8-NEXT:    s_ashr_i64 s[16:17], s[4:5], 60
-; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT:    v_mul_i32_i24_e32 v4, s4, v4
-; GFX8-NEXT:    s_lshl_b32 s5, s1, 12
-; GFX8-NEXT:    s_lshl_b32 s9, s7, 8
+; GFX8-NEXT:    v_mov_b32_e32 v6, s24
+; GFX8-NEXT:    s_ashr_i64 s[20:21], s[20:21], 60
 ; GFX8-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT:    v_mov_b32_e32 v6, s16
-; GFX8-NEXT:    s_ashr_i64 s[20:21], s[8:9], 60
-; GFX8-NEXT:    s_lshl_b32 s13, s1, 8
-; GFX8-NEXT:    s_ashr_i64 s[18:19], s[4:5], 60
-; GFX8-NEXT:    s_lshl_b32 s5, s7, 4
-; GFX8-NEXT:    v_mov_b32_e32 v7, s10
-; GFX8-NEXT:    s_lshl_b32 s9, s1, 4
-; GFX8-NEXT:    s_ashr_i64 s[24:25], s[4:5], 60
-; GFX8-NEXT:    s_ashr_i64 s[22:23], s[12:13], 60
+; GFX8-NEXT:    v_mov_b32_e32 v7, s22
+; GFX8-NEXT:    s_ashr_i64 s[32:33], s[18:19], 60
+; GFX8-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s20
-; GFX8-NEXT:    s_ashr_i64 s[6:7], s[6:7], 60
-; GFX8-NEXT:    s_ashr_i64 s[26:27], s[8:9], 60
-; GFX8-NEXT:    v_mov_b32_e32 v9, s24
-; GFX8-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX8-NEXT:    s_ashr_i64 s[30:31], s[4:5], 60
+; GFX8-NEXT:    v_mov_b32_e32 v9, s32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v3, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v5, v2
-; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v6, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s18, v7, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s22, v8, v2
-; GFX8-NEXT:    v_mad_i32_i24 v2, s26, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s16, v4, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s14, v5, v2
+; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT:    v_mad_i32_i24 v2, s12, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s10, v7, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s8, v8, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s30, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1702,17 +1704,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s12, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40014
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s8, s9
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
 ; GFX9-NEXT:    s_lshr_b32 s13, s6, 28
 ; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
@@ -1722,14 +1728,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_and_b32 s18, s6, 15
 ; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
@@ -1769,17 +1771,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s4, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s12, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40014
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s8, s9
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s4, s5
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
 ; GFX9-DL-NEXT:    s_lshr_b32 s13, s6, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
@@ -1789,14 +1795,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_and_b32 s18, s6, 15
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s18, s6
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s16, s17
 ; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s14, s15
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
@@ -1830,13 +1832,14 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2273,14 +2276,15 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index ae4086adb826..543b55e8e261 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -156,29 +156,29 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc32:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
@@ -304,46 +304,46 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v6, s7
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s11
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -352,46 +352,46 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s11
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -400,58 +400,59 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc16:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -606,46 +607,46 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v6, s7
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s11
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -654,46 +655,46 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s11
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -702,58 +703,59 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_and_b32 s1, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -909,48 +911,48 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_and_b32 s9, s0, 15
+; GFX8-NEXT:    s_and_b32 s16, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s6
-; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -960,48 +962,48 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -1011,61 +1013,62 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc4:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1208,48 +1211,48 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_and_b32 s9, s0, 15
+; GFX8-NEXT:    s_and_b32 s16, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s6
-; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1259,48 +1262,48 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -1310,61 +1313,62 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1909,29 +1913,29 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v0, v1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s4, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s5, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
@@ -2032,46 +2036,46 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v6, s7
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s11
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2080,53 +2084,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s13, s2, 28
-; GFX9-NEXT:    s_and_b32 s4, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s13, s6, 28
+; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s5, s2, 28
+; GFX9-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s15, s6, 0x40014
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-NEXT:    s_bfe_u32 s1, s0, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, s4, v0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s14, s15
 ; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40014
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s12, s0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s0, v4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s8, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s6, s7
-; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v6, s0
-; GFX9-NEXT:    v_pk_mul_lo_u16 v6, s1, v6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s2, v5
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX9-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
+; GFX9-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s16, s17
+; GFX9-NEXT:    v_pk_mul_lo_u16 v3, s5, v0
+; GFX9-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s10, s11
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s18, s6
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, s5, v0
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v6, v5, v6
+; GFX9-NEXT:    v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2135,65 +2139,66 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s13, s2, 28
-; GFX9-DL-NEXT:    s_and_b32 s4, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s6, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s13, s6, 28
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40018
+; GFX9-DL-NEXT:    s_lshr_b32 s5, s2, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s6, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s6, 0x40014
 ; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-DL-NEXT:    s_bfe_u32 s1, s0, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, s4, v0
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s14, s15
 ; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40014
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s12, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, v4
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s0, s6, s7
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s4, v3
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s0
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v6, s1, v6
-; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s9, s10
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s2, v5
+; GFX9-DL-NEXT:    s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s16, s6, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s17, s6, 0x4000c
+; GFX9-DL-NEXT:    s_and_b32 s18, s6, 15
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s8, s9
+; GFX9-DL-NEXT:    s_bfe_u32 s10, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s2, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s6, 0x40004
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s16, s17
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, s5, v0
+; GFX9-DL-NEXT:    s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s5, s10, s11
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s4, s18, s6
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, s5, v0
+; GFX9-DL-NEXT:    s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, s2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v6, v5, v6
+; GFX9-DL-NEXT:    v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT:    v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-DL-NEXT:    v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2548,45 +2553,46 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ;
 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40004
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40004
-; GFX10-DL-NEXT:    s_and_b32 s6, s0, 15
-; GFX10-DL-NEXT:    s_and_b32 s8, s1, 15
-; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x4000c
-; GFX10-DL-NEXT:    s_bfe_u32 s9, s1, 0x4000c
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s4, s5
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s6, s8
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s1, 0x40008
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s7, s9
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s5, s0, 15
+; GFX10-DL-NEXT:    s_and_b32 s7, s1, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x4000c
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, s2, s4
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s5, s7
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s1, 0x40008
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s6, s8
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX10-DL-NEXT:    s_bfe_u32 s7, s1, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s4, s5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s4
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v5
 ; GFX10-DL-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40010
-; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40018
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s5, v3
 ; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
 ; GFX10-DL-NEXT:    s_lshr_b32 s9, s1, 28
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s5, s7
-; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s4, s7
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s4, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s1, 0x40018
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s2, s8
 ; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s9
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
 ; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 8, v7
@@ -2598,7 +2604,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_or_b32_e32 v3, v6, v3
 ; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, s5, v3
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    v_or_b32_e32 v4, v3, v5
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
@@ -2696,48 +2702,48 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s1, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT:    s_and_b32 s9, s0, 15
+; GFX8-NEXT:    s_and_b32 s16, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT:    s_and_b32 s1, s0, 15
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT:    v_mov_b32_e32 v7, s6
-; GFX8-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT:    v_mov_b32_e32 v9, s2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX8-NEXT:    v_mov_b32_e32 v6, s14
+; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, s13
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2747,48 +2753,48 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -2798,61 +2804,62 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT:    s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT:    s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT:    s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT:    s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT:    s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT:    s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT:    s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v6, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3078,18 +3085,18 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ;
 ; GFX10-DL-LABEL: udot8_variant1:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s4, s3, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s1, s0, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-DL-NEXT:    s_endpgm
                                           i32 addrspace(1)* %v2addr,

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 627f68913866..341b3942aa36 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -8,11 +8,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, 0x3e7, s2
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, 0x3e7, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -21,11 +21,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s0
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
@@ -43,11 +43,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s4, s2
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s4, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -57,13 +57,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_and_b32 s0, s4, 0xffff
+; VI-NEXT:    s_and_b32 s1, s4, 0xffff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -73,12 +73,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_and_b32 s1, s4, 0xffff
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; CI-NEXT:    s_or_b32 s0, s1, s0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -95,11 +95,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s4, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -113,18 +113,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_and_b32 s0, s4, 0xffff
+; VI-NEXT:    s_and_b32 s1, s4, 0xffff
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
-; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    ;;#ASMSTART
-; VI-NEXT:    ; use s1
+; VI-NEXT:    ; use s2
 ; VI-NEXT:    ;;#ASMEND
 ; VI-NEXT:    s_endpgm
 ;
@@ -133,18 +133,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_and_b32 s0, s4, 0xffff
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s1, s4, 0xffff
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s1, s2, 16
-; CI-NEXT:    s_lshl_b32 s2, s1, 16
-; CI-NEXT:    s_or_b32 s0, s0, s2
-; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_lshr_b32 s0, s0, 16
+; CI-NEXT:    s_lshl_b32 s2, s0, 16
+; CI-NEXT:    s_or_b32 s1, s1, s2
+; CI-NEXT:    v_mov_b32_e32 v2, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    ;;#ASMSTART
-; CI-NEXT:    ; use s1
+; CI-NEXT:    ; use s0
 ; CI-NEXT:    ;;#ASMEND
 ; CI-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
@@ -162,11 +162,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s4, s2
+; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s4, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -176,13 +176,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshr_b32 s0, s4, 16
+; VI-NEXT:    s_lshr_b32 s1, s4, 16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -192,12 +192,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_lshr_b32 s1, s4, 16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; CI-NEXT:    s_or_b32 s0, s1, s0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -216,16 +216,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s1, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ; use s1
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -234,17 +234,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshr_b32 s0, s4, 16
+; VI-NEXT:    s_lshr_b32 s1, s4, 16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT:    s_or_b32 s1, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    ;;#ASMSTART
-; VI-NEXT:    ; use s0
+; VI-NEXT:    ; use s1
 ; VI-NEXT:    ;;#ASMEND
 ; VI-NEXT:    s_endpgm
 ;
@@ -253,17 +253,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_lshr_b32 s0, s4, 16
+; CI-NEXT:    s_lshr_b32 s1, s4, 16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
-; CI-NEXT:    s_or_b32 s1, s0, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; CI-NEXT:    s_or_b32 s0, s1, s0
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    ;;#ASMSTART
-; CI-NEXT:    ; use s0
+; CI-NEXT:    ; use s1
 ; CI-NEXT:    ;;#ASMEND
 ; CI-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
@@ -282,20 +282,20 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s0, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ; use s1
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    ;;#ASMSTART
-; GFX9-NEXT:    ; use s1
+; GFX9-NEXT:    ; use s0
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -304,21 +304,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshr_b32 s0, s4, 16
+; VI-NEXT:    s_lshr_b32 s1, s4, 16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
-; VI-NEXT:    s_or_b32 s2, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    ;;#ASMSTART
-; VI-NEXT:    ; use s0
+; VI-NEXT:    ; use s1
 ; VI-NEXT:    ;;#ASMEND
 ; VI-NEXT:    ;;#ASMSTART
-; VI-NEXT:    ; use s1
+; VI-NEXT:    ; use s2
 ; VI-NEXT:    ;;#ASMEND
 ; VI-NEXT:    s_endpgm
 ;
@@ -327,21 +327,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 s0, s4, 16
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_lshr_b32 s1, s4, 16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshr_b32 s1, s2, 16
-; CI-NEXT:    s_lshl_b32 s2, s1, 16
-; CI-NEXT:    s_or_b32 s2, s0, s2
+; CI-NEXT:    s_lshr_b32 s0, s0, 16
+; CI-NEXT:    s_lshl_b32 s2, s0, 16
+; CI-NEXT:    s_or_b32 s2, s1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    ;;#ASMSTART
-; CI-NEXT:    ; use s0
+; CI-NEXT:    ; use s1
 ; CI-NEXT:    ;;#ASMEND
 ; CI-NEXT:    ;;#ASMSTART
-; CI-NEXT:    ; use s1
+; CI-NEXT:    ; use s0
 ; CI-NEXT:    ;;#ASMEND
 ; CI-NEXT:    s_endpgm
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
@@ -363,11 +363,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, 0x3e7
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x3e7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -376,11 +376,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
+; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s0
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
@@ -397,11 +397,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -411,13 +411,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s0, s4, 16
+; VI-NEXT:    s_lshl_b32 s1, s4, 16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s2, 0xffff
-; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -427,12 +427,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_lshl_b32 s1, s4, 16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s0, s2, 0xffff
+; CI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; CI-NEXT:    s_or_b32 s0, s0, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -448,11 +448,11 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x4500, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -462,11 +462,11 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s0
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
@@ -482,11 +482,11 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, 0x4500
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x4500
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -495,11 +495,11 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
-; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff
+; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s0
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
@@ -515,19 +515,19 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x3e7
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_movk_i32 s0, 0x3e7
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v2, v3, s4, v4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_bfi_b32 v0, v1, s0, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_0:
@@ -538,14 +538,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_0:
@@ -556,14 +556,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0x3e7, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -581,19 +581,19 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff0000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 16, s4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 16, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
@@ -605,15 +605,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    s_lshr_b32 s1, s4, 16
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_lshr_b32 s0, s4, 16
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_or_b32_e32 v2, s1, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
@@ -625,15 +625,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    s_lshr_b32 s1, s4, 16
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_lshr_b32 s0, s4, 16
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT:    v_or_b32_e32 v2, s1, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -652,18 +652,18 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v2, v3, 53, v4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_bfi_b32 v0, v1, 53, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
@@ -674,14 +674,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_or_b32_e32 v2, 53, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v0, 53, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
@@ -692,14 +692,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT:    v_or_b32_e32 v2, 53, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT:    v_or_b32_e32 v0, 53, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -717,37 +717,37 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x3e7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_movk_i32 s0, 0x3e7
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v2, s4, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3e70000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_1:
@@ -758,14 +758,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0x3e70000, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0x3e70000, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -786,32 +786,32 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v2, -15, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, -15, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0xfff10000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
@@ -822,14 +822,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0xfff10000, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0xfff10000, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -846,19 +846,19 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4500
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4500
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2f16_0:
@@ -869,14 +869,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2f16_0:
@@ -887,14 +887,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0x4500, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -915,14 +915,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, 53
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, 53
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
@@ -933,14 +933,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_or_b32_e32 v2, 53, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v0, 53, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
@@ -951,14 +951,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT:    v_or_b32_e32 v2, 53, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT:    v_or_b32_e32 v0, 53, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -975,37 +975,37 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x4500
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_movk_i32 s0, 0x4500
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v2, s4, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2f16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x45000000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2f16_1:
@@ -1016,14 +1016,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0x45000000, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0x45000000, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1044,32 +1044,32 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v2, 35, 16, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, 35, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x230000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
@@ -1080,14 +1080,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT:    v_or_b32_e32 v2, 0x230000, v2
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, 0x230000, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1167,20 +1167,20 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    s_lshl_b32 s0, s4, 4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v2, s0, v3, v4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_bfi_b32 v0, s0, v1, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
@@ -1188,20 +1188,20 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    s_lshl_b32 s2, s4, 4
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_lshl_b32 s0, 0xffff, s2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    s_lshl_b32 s0, s4, 4
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_bfi_b32 v2, s0, v3, v4
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_bfi_b32 v0, s0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
@@ -1209,20 +1209,20 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 0x3e703e7
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v4, v[0:1]
-; CI-NEXT:    s_lshl_b32 s2, s4, 4
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_lshl_b32 s0, 0xffff, s2
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT:    s_lshl_b32 s0, s4, 4
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; CI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_bfi_b32 v2, s0, v3, v4
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_bfi_b32 v0, s0, v1, v0
+; CI-NEXT:    flat_store_dword v[2:3], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1239,80 +1239,78 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, 0xffff
-; GFX9-NEXT:    s_mov_b32 s7, 0x12341234
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v4
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v2, v2, s7, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x12341234
+; GFX9-NEXT:    v_bfi_b32 v0, v1, s0, v0
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s6, 0xffff
-; VI-NEXT:    s_mov_b32 s7, 0x12341234
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
-; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_mov_b32 s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_bfi_b32 v2, v2, s7, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; VI-NEXT:    s_mov_b32 s0, 0x12341234
+; VI-NEXT:    v_bfi_b32 v0, v1, s0, v0
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s6, 0x12341234
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    flat_load_dword v4, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; CI-NEXT:    flat_load_dword v3, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s5
+; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT:    flat_load_dword v2, v[2:3]
+; CI-NEXT:    flat_load_dword v0, v[0:1]
+; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    s_mov_b32 s0, 0x12341234
+; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 4, v4
-; CI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
+; CI-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_bfi_b32 v2, v2, s6, v3
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_bfi_b32 v0, v1, s0, v0
+; CI-NEXT:    flat_store_dword v[4:5], v0
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1356,13 +1354,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_and_b32 s1, s4, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; VI-NEXT:    v_or_b32_e32 v0, s1, v0
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1376,13 +1374,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_and_b32 s1, s4, 0xffff
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT:    v_or_b32_e32 v0, s1, v0
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1423,14 +1421,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_lshl_b32 s2, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    s_lshl_b32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -1447,13 +1445,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_lshl_b32 s1, s4, 16
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_lshl_b32 s0, s4, 16
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT:    v_or_b32_e32 v0, s1, v0
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1498,13 +1496,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_and_b32 s1, s4, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT:    v_or_b32_e32 v1, s1, v1
+; VI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1518,13 +1516,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_and_b32 s1, s4, 0xffff
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT:    v_or_b32_e32 v1, s1, v1
+; CI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1565,14 +1563,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_lshl_b32 s2, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    s_lshl_b32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -1589,13 +1587,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_lshl_b32 s1, s4, 16
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_lshl_b32 s0, s4, 16
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT:    v_or_b32_e32 v1, s1, v1
+; CI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1640,13 +1638,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_and_b32 s1, s4, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT:    v_or_b32_e32 v1, s1, v1
+; VI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1660,13 +1658,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_and_b32 s1, s4, 0xffff
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_and_b32 s0, s4, 0xffff
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT:    v_or_b32_e32 v1, s1, v1
+; CI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1686,51 +1684,51 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s6, s6
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v4, s[4:5]
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v4, s[0:1]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v1, v5, s1, v1
-; GFX9-NEXT:    v_bfi_b32 v0, v4, s1, v0
+; GFX9-NEXT:    v_bfi_b32 v1, v5, s0, v1
+; GFX9-NEXT:    v_bfi_b32 v0, v4, s0, v0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    flat_load_dword v4, v[0:1]
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_mov_b32 s4, 0xffff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 s5, 0
-; VI-NEXT:    s_and_b32 s1, s6, s4
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_lshl_b32 s0, s1, 16
-; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_mov_b32 s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_and_b32 s2, s4, s0
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:    s_lshl_b32 s3, s2, 16
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[4:5]
+; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[0:1]
+; VI-NEXT:    s_or_b32 s0, s2, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
 ; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
@@ -1739,29 +1737,29 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ;
 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    s_load_dword s6, s[4:5], 0x4
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_mov_b32 s4, 0xffff
+; CI-NEXT:    s_mov_b32 s6, 0xffff
+; CI-NEXT:    s_mov_b32 s7, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    s_mov_b32 s5, 0
-; CI-NEXT:    s_lshl_b32 s2, s6, 16
-; CI-NEXT:    s_and_b32 s3, s6, s4
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_lshl_b32 s1, s4, 16
+; CI-NEXT:    s_and_b32 s3, s4, s6
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_or_b32 s1, s3, s2
+; CI-NEXT:    s_or_b32 s0, s3, s1
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; CI-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; CI-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_bfi_b32 v1, v5, s1, v1
-; CI-NEXT:    v_bfi_b32 v0, v4, s1, v0
+; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
+; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1783,24 +1781,24 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    s_mov_b32 s7, 0
-; GFX9-NEXT:    s_mov_b32 s6, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s4
-; GFX9-NEXT:    s_lshl_b32 s2, s5, 4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s4, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[6:7], s2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s3, s5, 4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v1, s1, v4, v1
-; GFX9-NEXT:    v_bfi_b32 v0, s0, v5, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s1, v5, v1
+; GFX9-NEXT:    v_bfi_b32 v0, s0, v4, v0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1809,20 +1807,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_mov_b32 s6, 0xffff
-; VI-NEXT:    s_mov_b32 s7, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_and_b32 s2, s4, s6
-; VI-NEXT:    s_lshl_b32 s3, s2, 16
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_lshl_b32 s4, s5, 4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_lshl_b64 s[0:1], s[6:7], s4
+; VI-NEXT:    s_mov_b32 s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:    s_lshl_b32 s2, s5, 4
+; VI-NEXT:    s_and_b32 s3, s4, s0
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT:    s_lshl_b32 s2, s3, 16
+; VI-NEXT:    s_or_b32 s2, s3, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v5, s2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1837,20 +1835,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_mov_b32 s6, 0xffff
-; CI-NEXT:    s_mov_b32 s7, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    s_and_b32 s2, s4, s6
-; CI-NEXT:    s_lshl_b32 s3, s4, 16
-; CI-NEXT:    s_or_b32 s2, s2, s3
-; CI-NEXT:    s_lshl_b32 s4, s5, 4
-; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_lshl_b64 s[0:1], s[6:7], s4
+; CI-NEXT:    s_mov_b32 s0, 0xffff
+; CI-NEXT:    s_and_b32 s2, s4, s0
+; CI-NEXT:    s_lshl_b32 s4, s4, 16
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_or_b32 s2, s2, s4
+; CI-NEXT:    s_mov_b32 s1, 0
+; CI-NEXT:    s_lshl_b32 s3, s5, 4
+; CI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
 ; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    v_mov_b32_e32 v5, s2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 90d7b693b601..614a2b3d6d02 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -21,10 +21,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -33,10 +33,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
@@ -62,8 +62,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -73,8 +73,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, s0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
@@ -123,14 +123,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
@@ -145,14 +145,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -191,13 +191,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v3, 1.0
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, 1.0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
@@ -208,13 +208,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, v3, 1.0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, 1.0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -251,13 +251,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, 1.0, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, 1.0, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
@@ -268,13 +268,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, 1.0, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, 1.0, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -318,14 +318,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, v1
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
@@ -340,14 +340,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -394,14 +394,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, -v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, -v1
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
@@ -416,14 +416,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, v5, -v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, -v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -470,14 +470,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, -v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, -v1
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
@@ -492,14 +492,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, -v5, -v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -v0, -v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -547,14 +547,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
@@ -569,14 +569,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index e7d2fb202961..5153c6f1eb2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -313,16 +313,16 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -441,16 +441,16 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -569,16 +569,16 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -693,16 +693,16 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -821,16 +821,16 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -949,16 +949,16 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1081,16 +1081,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1205,16 +1205,16 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -1333,16 +1333,16 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
 ; GFX10-NEXT:    image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:
@@ -3247,11 +3247,11 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
 ; VERDE-LABEL: image_load_mmo:
 ; VERDE:       ; %bb.0:
 ; VERDE-NEXT:    image_load v1, v[1:2], s[0:7] dmask:0x1 unorm
-; VERDE-NEXT:    v_mov_b32_e32 v2, 0
+; VERDE-NEXT:    v_mov_b32_e32 v3, 0
 ; VERDE-NEXT:    s_mov_b32 m0, -1
-; VERDE-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
-; VERDE-NEXT:    ds_write_b32 v0, v2
-; VERDE-NEXT:    ds_write_b32 v3, v2
+; VERDE-NEXT:    ds_write_b32 v0, v3
+; VERDE-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
+; VERDE-NEXT:    ds_write_b32 v0, v3
 ; VERDE-NEXT:    s_waitcnt vmcnt(0)
 ; VERDE-NEXT:    v_mov_b32_e32 v0, v1
 ; VERDE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3291,9 +3291,9 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
 ; GFX10-LABEL: image_load_mmo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00]
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    ds_write2_b32 v0, v3, v3 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x03,0x03,0x00]
+; GFX10-NEXT:    ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 59419a65c70e..52c486f18309 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -1783,13 +1783,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v11, s13 ; encoding: [0x0d,0x02,0x16,0x7e]
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
 ; GFX10-NEXT:    image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
-; GFX10-NEXT:    v_mov_b32_e32 v10, s12 ; encoding: [0x0c,0x02,0x14,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    global_store_dword v[10:11], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x01,0x7d,0x00]
+; GFX10-NEXT:    global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    ; return to shader part epilog
 main_body:

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 0bf3125b22bf..09a0ea1ee58c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -9,11 +9,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT0:       ; %bb.0: ; %entry
 ; VARIANT0-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; VARIANT0-NEXT:    s_load_dword s2, s[0:1], 0xb
-; VARIANT0-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT0-NEXT:    s_mov_b32 s7, 0xf000
 ; VARIANT0-NEXT:    s_mov_b32 s6, 0
 ; VARIANT0-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT0-NEXT:    v_mov_b32_e32 v2, 0
+; VARIANT0-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT0-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT0-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; VARIANT0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
@@ -30,11 +30,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT1:       ; %bb.0: ; %entry
 ; VARIANT1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; VARIANT1-NEXT:    s_load_dword s2, s[0:1], 0xb
-; VARIANT1-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT1-NEXT:    s_mov_b32 s7, 0xf000
 ; VARIANT1-NEXT:    s_mov_b32 s6, 0
 ; VARIANT1-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT1-NEXT:    v_mov_b32_e32 v2, 0
+; VARIANT1-NEXT:    v_not_b32_e32 v3, v0
 ; VARIANT1-NEXT:    s_waitcnt lgkmcnt(0)
 ; VARIANT1-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; VARIANT1-NEXT:    s_barrier
@@ -51,45 +51,45 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 ; VARIANT2:       ; %bb.0: ; %entry
 ; VARIANT2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VARIANT2-NEXT:    s_load_dword s0, s[0:1], 0x2c
-; VARIANT2-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT2-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT2-NEXT:    s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT:    v_mov_b32_e32 v4, s3
-; VARIANT2-NEXT:    v_xad_u32 v1, v0, -1, s0
-; VARIANT2-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT:    v_mov_b32_e32 v2, s3
+; VARIANT2-NEXT:    v_xad_u32 v3, v0, -1, s0
+; VARIANT2-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; VARIANT2-NEXT:    v_add_co_u32_e32 v1, vcc, s2, v1
+; VARIANT2-NEXT:    v_lshlrev_b64 v[3:4], 2, v[3:4]
+; VARIANT2-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; VARIANT2-NEXT:    global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT:    v_mov_b32_e32 v0, s3
 ; VARIANT2-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT:    v_lshlrev_b64 v[1:2], 2, v[1:2]
-; VARIANT2-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; VARIANT2-NEXT:    global_store_dword v[3:4], v0, off
-; VARIANT2-NEXT:    v_mov_b32_e32 v5, s3
-; VARIANT2-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v1
-; VARIANT2-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT2-NEXT:    v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
 ; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
 ; VARIANT2-NEXT:    s_barrier
-; VARIANT2-NEXT:    global_load_dword v0, v[0:1], off
+; VARIANT2-NEXT:    global_load_dword v0, v[3:4], off
 ; VARIANT2-NEXT:    s_waitcnt vmcnt(0)
-; VARIANT2-NEXT:    global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT:    global_store_dword v[1:2], v0, off
 ; VARIANT2-NEXT:    s_endpgm
 ;
 ; VARIANT3-LABEL: test_barrier:
 ; VARIANT3:       ; %bb.0: ; %entry
 ; VARIANT3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VARIANT3-NEXT:    s_load_dword s0, s[0:1], 0x2c
-; VARIANT3-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT3-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; VARIANT3-NEXT:    s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT:    v_mov_b32_e32 v4, s3
-; VARIANT3-NEXT:    v_xad_u32 v1, v0, -1, s0
-; VARIANT3-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT3-NEXT:    v_mov_b32_e32 v2, s3
+; VARIANT3-NEXT:    v_xad_u32 v3, v0, -1, s0
+; VARIANT3-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; VARIANT3-NEXT:    v_add_co_u32_e32 v1, vcc, s2, v1
+; VARIANT3-NEXT:    v_lshlrev_b64 v[3:4], 2, v[3:4]
+; VARIANT3-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; VARIANT3-NEXT:    global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT:    v_mov_b32_e32 v0, s3
 ; VARIANT3-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT:    v_lshlrev_b64 v[1:2], 2, v[1:2]
-; VARIANT3-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; VARIANT3-NEXT:    global_store_dword v[3:4], v0, off
-; VARIANT3-NEXT:    v_mov_b32_e32 v5, s3
-; VARIANT3-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v1
-; VARIANT3-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT:    v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
 ; VARIANT3-NEXT:    s_barrier
-; VARIANT3-NEXT:    global_load_dword v0, v[0:1], off
+; VARIANT3-NEXT:    global_load_dword v0, v[3:4], off
 ; VARIANT3-NEXT:    s_waitcnt vmcnt(0)
-; VARIANT3-NEXT:    global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT:    global_store_dword v[1:2], v0, off
 ; VARIANT3-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index f10b504a92bf..1c5bb08957d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1537,8 +1537,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out)
 define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 ; SI-LABEL: simplify_bfe_u32_multi_use_arg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s6, s2

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 3801da79f1c4..60c6897b2adc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -85,20 +85,20 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_movk_i32 s9, 0xfc01
-; SI-NEXT:    s_mov_b32 s5, 0xfffff
-; SI-NEXT:    s_mov_b32 s4, -1
+; SI-NEXT:    s_movk_i32 s11, 0xfc01
+; SI-NEXT:    s_mov_b32 s9, 0xfffff
+; SI-NEXT:    s_mov_b32 s8, -1
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
-; SI-NEXT:    s_brev_b32 s8, -2
+; SI-NEXT:    s_brev_b32 s10, -2
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_mov_b32 s7, 0x80000
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT:    v_add_i32_e32 v10, vcc, s9, v4
-; SI-NEXT:    v_lshr_b64 v[4:5], s[4:5], v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, s11, v4
+; SI-NEXT:    v_lshr_b64 v[4:5], s[8:9], v10
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
-; SI-NEXT:    v_bfi_b32 v11, s8, v8, v3
+; SI-NEXT:    v_bfi_b32 v11, s10, v8, v3
 ; SI-NEXT:    v_and_b32_e32 v9, v3, v5
 ; SI-NEXT:    v_and_b32_e32 v8, v2, v4
 ; SI-NEXT:    v_lshr_b64 v[6:7], s[6:7], v10
@@ -122,26 +122,26 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 ;
 ; CI-LABEL: v_round_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
-; CI-NEXT:    v_mov_b32_e32 v4, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_brev_b32 s6, -2
+; CI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[2:3]
-; CI-NEXT:    v_add_f64 v[8:9], v[2:3], -v[6:7]
-; CI-NEXT:    v_bfi_b32 v2, s2, v5, v3
-; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v2, vcc
-; CI-NEXT:    v_add_f64 v[2:3], v[6:7], v[4:5]
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[2:3]
+; CI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
+; CI-NEXT:    v_bfi_b32 v2, s6, v8, v3
+; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid

diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 22d68bad99ae..b7df7a58e82c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -519,11 +519,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v0, v0
-; GFX900-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    ds_write_b16 v3, v0
-; GFX900-NEXT:    v_bfi_b32 v0, v2, v0, v1
+; GFX900-NEXT:    ds_write_b16 v2, v0
+; GFX900-NEXT:    v_bfi_b32 v0, v3, v0, v1
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -532,11 +532,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
-; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    ds_write_b16 v3, v0
-; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
+; GFX906-NEXT:    ds_write_b16 v2, v0
+; GFX906-NEXT:    v_bfi_b32 v0, v3, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -580,10 +580,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0
+; GFX906-NEXT:    ds_write_b16 v3, v2
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    ds_write_b16 v4, v3
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
@@ -596,11 +596,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX803-NEXT:    v_mov_b32_e32 v3, 0
-; GFX803-NEXT:    ds_write_b16 v3, v1
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0
+; GFX803-NEXT:    ds_write_b16 v2, v1
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX803-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -618,12 +618,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v0, v0
-; GFX900-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX900-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_write_b16 v2, v0
-; GFX900-NEXT:    ds_write_b16 v3, v5
-; GFX900-NEXT:    v_bfi_b32 v0, v4, v0, v1
+; GFX900-NEXT:    ds_write_b16 v3, v4
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX900-NEXT:    v_bfi_b32 v0, v2, v0, v1
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -632,12 +632,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    ds_write_b16 v2, v0
-; GFX906-NEXT:    ds_write_b16 v3, v5
-; GFX906-NEXT:    v_bfi_b32 v0, v4, v0, v1
+; GFX906-NEXT:    ds_write_b16 v3, v4
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -648,11 +648,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    ds_write_b16 v2, v0
 ; GFX803-NEXT:    ds_write_b16 v3, v1
-; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1035,10 +1035,10 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1117,10 +1117,10 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1524,11 +1524,11 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
-; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
@@ -1606,11 +1606,11 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
-; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
index bd1b5c39037d..ae8dad231a12 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -112,22 +112,22 @@ bb:
 define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
 ; GCN-LABEL: muli24_shl64:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v0, 0x800000, v1
+; GCN-NEXT:    v_or_b32_e32 v0, 0x800000, v0
 ; GCN-NEXT:    v_mul_i32_i24_e32 v0, -7, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 844a200de738..ee9f8fa49a27 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_pk_lshrrev_b16 v2, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, s0, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -74,51 +74,51 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, v4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v5
-; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b16_e32 v4, v1, v0
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_lshr_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
 ; CI-NEXT:    s_mov_b32 s8, 0xffff
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -129,7 +129,7 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    v_lshr_b32_e32 v3, v4, v5
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -153,13 +153,13 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, s0, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s0, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_s_v2i16:
@@ -171,39 +171,39 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e32 v2, s0, v3
-; VI-NEXT:    v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b16_e32 v1, s0, v0
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: lshr_v_s_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
+; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
+; CI-NEXT:    s_mov_b32 s8, 0xffff
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s9, s0, 16
+; CI-NEXT:    s_and_b32 s10, s0, s8
 ; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT:    s_lshr_b32 s9, s8, 16
-; CI-NEXT:    s_mov_b32 s10, 0xffff
-; CI-NEXT:    s_and_b32 s8, s8, s10
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT:    v_and_b32_e32 v2, s10, v2
+; CI-NEXT:    v_and_b32_e32 v2, s8, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, s9, v3
-; CI-NEXT:    v_lshrrev_b32_e32 v2, s8, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v2, s10, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@@ -228,13 +228,13 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v3, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_s_v_v2i16:
@@ -246,39 +246,39 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, s0
-; VI-NEXT:    v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b16_e64 v1, v0, s0
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: lshr_s_v_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
+; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
+; CI-NEXT:    s_mov_b32 s8, 0xffff
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s9, s0, 16
+; CI-NEXT:    s_and_b32 s10, s0, s8
 ; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT:    s_lshr_b32 s9, s8, 16
-; CI-NEXT:    s_mov_b32 s10, 0xffff
-; CI-NEXT:    s_and_b32 s8, s8, s10
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT:    v_and_b32_e32 v2, s10, v2
+; CI-NEXT:    v_and_b32_e32 v2, s8, v2
 ; CI-NEXT:    v_lshr_b32_e32 v3, s9, v3
-; CI-NEXT:    v_lshr_b32_e32 v2, s8, v2
+; CI-NEXT:    v_lshr_b32_e32 v2, s10, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@@ -302,46 +302,46 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_imm_v_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 8
+; VI-NEXT:    v_mov_b32_e32 v4, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e64 v2, v4, 8
-; VI-NEXT:    v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b16_e64 v1, v0, 8
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: lshr_imm_v_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -349,7 +349,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; CI-NEXT:    v_lshr_b32_e32 v2, 8, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -370,13 +370,13 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_imm_v2i16:
@@ -387,32 +387,32 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: lshr_v_imm_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -428,60 +428,60 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GFX9-LABEL: v_lshr_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, v3
-; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, v5
+; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, v4
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v4i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e32 v6, v3, v1
-; VI-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v0
-; VI-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshrrev_b16_e32 v6, v5, v1
+; VI-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshrrev_b16_e32 v5, v4, v0
+; VI-NEXT:    v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_e32 v1, v6, v1
-; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT:    v_or_b32_e32 v0, v5, v0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_lshr_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
 ; CI-NEXT:    s_mov_b32 s8, 0xffff
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
@@ -500,7 +500,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v3, v3, v5
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v4
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -557,22 +557,22 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i
 ;
 ; CI-LABEL: lshr_v_imm_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_mov_b32 s8, 0xff00ff
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; CI-NEXT:    v_and_b32_e32 v3, s8, v3
 ; CI-NEXT:    v_and_b32_e32 v2, s8, v2
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 178d111dd70a..13ade66bf8ff 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -16,14 +16,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v5, v[0:1]
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    flat_load_ushort v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v2, v5, v2
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_max_i16_e32 v0, v0, v1
+; VI-NEXT:    flat_store_short v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_i16:
@@ -38,14 +38,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_i16_e32 v2, v5, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_short v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@@ -73,16 +73,16 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v3, v5, v2
-; VI-NEXT:    v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_max_i16_e32 v2, v0, v1
+; VI-NEXT:    v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_v2i16:
@@ -97,14 +97,14 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v2, v5, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
@@ -124,35 +124,35 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v6
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v6
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v7, s5
-; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
-; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT:    flat_load_ushort v8, v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
+; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v8, v[6:7]
 ; VI-NEXT:    flat_load_dword v9, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    flat_load_dword v1, v[2:3]
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v6
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v4
+; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; VI-NEXT:    v_max_i16_e32 v0, v8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_i16_e32 v2, v9, v1
 ; VI-NEXT:    v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    flat_store_short v[4:5], v0
-; VI-NEXT:    flat_store_dword v[6:7], v1
+; VI-NEXT:    flat_store_short v[6:7], v0
+; VI-NEXT:    flat_store_dword v[4:5], v1
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_v3i16:
@@ -160,8 +160,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
@@ -169,19 +169,18 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_short_d16 v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off
-; GFX9-NEXT:    global_load_short_d16 v5, v[2:3], off offset:4
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_pk_max_i16 v3, v6, v5
+; GFX9-NEXT:    global_load_short_d16 v7, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_short_d16 v6, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v2, v7, v2
-; GFX9-NEXT:    global_store_short v[0:1], v3, off offset:4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT:    v_pk_max_i16 v1, v7, v6
+; GFX9-NEXT:    global_store_short v[4:5], v1, off offset:4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
@@ -272,14 +271,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v5, v[0:1]
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    flat_load_ushort v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v2, v5, v2
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_max_i16_e32 v0, v0, v1
+; VI-NEXT:    flat_store_short v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sgt_i16:
@@ -294,14 +293,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_i16_e32 v2, v5, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_short v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@@ -329,14 +328,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v5, v[0:1]
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    flat_load_ushort v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v2, v5, v2
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_max_u16_e32 v0, v0, v1
+; VI-NEXT:    flat_store_short v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_umax_uge_i16:
@@ -351,14 +350,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_u16_e32 v2, v5, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_short v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@@ -386,14 +385,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v5, v[0:1]
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    flat_load_ushort v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v2, v5, v2
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_max_u16_e32 v0, v0, v1
+; VI-NEXT:    flat_store_short v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_umax_ugt_i16:
@@ -408,14 +407,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_ushort v5, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_u16_e32 v2, v5, v2
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_short v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@@ -442,16 +441,16 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_u16_e32 v3, v5, v2
-; VI-NEXT:    v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_max_u16_e32 v2, v0, v1
+; VI-NEXT:    v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    flat_store_dword v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_umax_ugt_v2i16:
@@ -466,14 +465,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v2, v[2:3], off
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_u16 v2, v5, v2
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid

diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
index dbe03109b819..88cb57ca0cdc 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -25,34 +25,34 @@ body:             |
   ; CHECK:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
   ; CHECK:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
-  ; CHECK:   undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
-  ; CHECK:   %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+  ; CHECK:   undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
   ; CHECK:   [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   undef %11.sub1:vreg_64 = IMPLICIT_DEF
-  ; CHECK:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
   ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
   ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
+  ; CHECK:   [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
+  ; CHECK:   [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
+  ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
   ; CHECK:   GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
-  ; CHECK:   %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
   ; CHECK:   dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec
-  ; CHECK:   dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec
-  ; CHECK:   dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
-  ; CHECK:   GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec
+  ; CHECK:   dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]]
+  ; CHECK:   GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   S_SETREG_IMM32_B32 0, 1

diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index 47c8e19d63c4..e9f0678f7782 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -39,6 +39,8 @@ body:             |
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   INLINEASM &"", 1, 851978, def dead %11
   ; CHECK:   GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+  ; CHECK:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3)
   ; CHECK:   INLINEASM &"def $0 $1", 1, 851978, def %15, 851978, def %16
   ; CHECK:   [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
@@ -46,24 +48,22 @@ body:             |
   ; CHECK:   [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
   ; CHECK:   INLINEASM &"def $0 $1", 1, 851978, def %21, 851978, def %22
   ; CHECK:   [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
-  ; CHECK:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
-  ; CHECK:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
-  ; CHECK:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
-  ; CHECK:   [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
-  ; CHECK:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   INLINEASM &"", 1, 851978, def dead [[V_MOV_B32_e32_2]], 851978, def dead [[V_MOV_B32_e32_3]], 851977, [[DS_READ_B64_gfx9_]].sub0, 2147483657, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193, [[V_MOV_B32_e32_3]](tied-def 5), 851977, %15, 851977, %16, 851977, [[DS_READ_B32_gfx9_1]], 851977, [[DS_READ_B32_gfx9_]], 851977, [[DS_READ_B32_gfx9_3]], 851977, [[DS_READ_B32_gfx9_2]]
+  ; CHECK:   %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
   ; CHECK:   DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3)
   ; CHECK:   DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3)
   ; CHECK:   DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
   ; CHECK:   undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+  ; CHECK:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
+  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; CHECK:   [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
   ; CHECK:   [[V_MUL_LO_U32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec
+  ; CHECK:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
   ; CHECK:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec
   ; CHECK:   [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_]], [[DEF1]], implicit $exec
   ; CHECK:   [[V_MUL_LO_U32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; CHECK:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec
   ; CHECK:   [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_1]], [[V_MUL_LO_U32_]], implicit $exec
-  ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; CHECK:   [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec
   ; CHECK:   undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_I32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec
   ; CHECK:   undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir
index 63523167ac14..e8f5b44e2fc9 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir
@@ -19,10 +19,10 @@ body:             |
     ; GCN-LABEL: name: handleMove_bundle
     ; GCN: liveins: $sgpr4_sgpr5
     ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
-    ; GCN: $vcc_hi = IMPLICIT_DEF
-    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4)
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN: $vcc_hi = IMPLICIT_DEF
     ; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3)
     ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
     ; GCN: $m0 = S_MOV_B32 0

diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 1a6059a0eb86..f37dfbd92e25 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1006,33 +1006,33 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
-; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s7, s2
-; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
-; GCN-IR-NEXT:    s_sext_i32_i16 s5, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 24
-; GCN-IR-NEXT:    s_ashr_i32 s4, s5, 31
-; GCN-IR-NEXT:    s_mov_b32 s5, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 24
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s10, s8, s4
+; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 24
+; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_subb_u32 s11, s9, s4
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-IR-NEXT:    s_flbit_i32_b32 s0, s6
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
+; GCN-IR-NEXT:    s_ashr_i32 s6, s5, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[12:13], s[4:5], 24
+; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[2:3], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s10, s4, s2
+; GCN-IR-NEXT:    s_mov_b32 s7, s6
+; GCN-IR-NEXT:    s_subb_u32 s11, s5, s2
+; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[6:7], s[12:13]
+; GCN-IR-NEXT:    s_sub_u32 s8, s4, s6
+; GCN-IR-NEXT:    s_subb_u32 s9, s5, s6
+; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s0, s8
 ; GCN-IR-NEXT:    s_add_i32 s0, s0, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s1, s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s1, s9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s0, s10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
 ; GCN-IR-NEXT:    s_add_i32 s0, s0, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s1, s11
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1042,7 +1042,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
 ; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[0:1], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
@@ -1074,10 +1074,10 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:  BB9_4: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[10:11], v4
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
+; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
@@ -1092,9 +1092,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
@@ -1112,16 +1112,16 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GCN-IR-NEXT:  BB9_7: ; %udiv-end
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[8:11], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24

diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index a0a4c2a495c7..c96ff256c8c6 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -281,28 +281,28 @@ define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrs
 define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; GCN-LABEL: shl_i16_computed_amount:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s10, s6
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s10, s2
+; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s2
-; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    s_mov_b32 s8, s6
+; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s14, 0
-; GCN-NEXT:    s_mov_b32 s15, s7
-; GCN-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT:    s_mov_b32 s15, s3
+; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
 ; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
-; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_i16_computed_amount:
@@ -402,35 +402,35 @@ define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 ; GCN-LABEL: shl_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s10, s6
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s10, s2
+; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s2
-; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    s_mov_b32 s8, s6
+; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s14, 0
-; GCN-NEXT:    s_mov_b32 s15, s7
-; GCN-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT:    s_mov_b32 s15, s3
+; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-NEXT:    buffer_load_dword v2, off, s[8:11], 0
 ; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s4, 0xffff
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
 ; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_v2i16:
@@ -481,17 +481,17 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
 define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 ; GCN-LABEL: shl_v4i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
 ; GCN-NEXT:    s_mov_b32 s8, 0xffff
-; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -510,7 +510,7 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
 ; GCN-NEXT:    v_and_b32_e32 v2, s8, v2
 ; GCN-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; EG-LABEL: shl_v4i16:
@@ -869,20 +869,20 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %
 define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN-LABEL: v_shl_32_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], 3
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64
-; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GCN-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_shl_32_i64:

diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 7cfc8b18d216..97184c5a0923 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -73,51 +73,51 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v4, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, v4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v5, v[0:1]
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
-; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b16_e32 v4, v1, v0
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_shl_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
 ; CI-NEXT:    s_mov_b32 s8, 0xffff
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -128,7 +128,7 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_and_b32_e32 v2, s8, v2
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -152,13 +152,13 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v2, s0, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s0, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_s_v2i16:
@@ -170,39 +170,39 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, s0, v3
-; VI-NEXT:    v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, s0, v0
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: shl_v_s_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
+; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
+; CI-NEXT:    s_mov_b32 s8, 0xffff
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s9, s0, 16
+; CI-NEXT:    s_and_b32 s10, s0, s8
 ; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT:    s_mov_b32 s9, 0xffff
-; CI-NEXT:    s_lshr_b32 s10, s8, 16
-; CI-NEXT:    s_and_b32 s8, s8, s9
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
-; CI-NEXT:    v_lshlrev_b32_e32 v3, s10, v3
-; CI-NEXT:    v_and_b32_e32 v2, s9, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, s10, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v3, s9, v3
+; CI-NEXT:    v_and_b32_e32 v2, s8, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@@ -227,13 +227,13 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v3, s0
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_s_v_v2i16:
@@ -245,17 +245,17 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, s0
-; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b16_e64 v1, v0, s0
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: shl_s_v_v2i16:
@@ -270,12 +270,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_mov_b32 s0, 0xffff
-; CI-NEXT:    s_lshr_b32 s1, s8, 16
+; CI-NEXT:    s_lshr_b32 s9, s8, 16
 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v3, s0, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT:    v_lshl_b32_e32 v2, s1, v2
+; CI-NEXT:    v_lshl_b32_e32 v2, s9, v2
 ; CI-NEXT:    v_lshl_b32_e32 v3, s8, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_and_b32_e32 v3, s0, v3
@@ -301,46 +301,46 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_imm_v_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 8
+; VI-NEXT:    v_mov_b32_e32 v4, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e64 v2, v4, 8
-; VI-NEXT:    v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b16_e64 v1, v0, 8
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: shl_imm_v_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -349,7 +349,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_and_b32_e32 v3, 0xfff8, v3
 ; CI-NEXT:    v_or_b32_e32 v2, v3, v2
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -370,13 +370,13 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_imm_v2i16:
@@ -387,33 +387,33 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_and_b32_e32 v1, 0xff000000, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: shl_v_imm_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
-; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -429,60 +429,60 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; GFX9-LABEL: v_shl_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, v3
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, v5
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, v4
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v4i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v0
+; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
-; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
-; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v6, v5, v1
+; VI-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_lshlrev_b16_e32 v5, v4, v0
+; VI-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_e32 v1, v6, v1
-; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT:    v_or_b32_e32 v0, v5, v0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: v_shl_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
 ; CI-NEXT:    s_mov_b32 s8, 0xffff
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -501,7 +501,7 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
 ; CI-NEXT:    v_and_b32_e32 v2, s8, v2
 ; CI-NEXT:    v_or_b32_e32 v3, v3, v5
 ; CI-NEXT:    v_or_b32_e32 v2, v2, v4
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -538,22 +538,22 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_mov_b32 s4, 0xff000000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    s_mov_b32 s0, 0xff000000
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
 ; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
+; VI-NEXT:    v_and_b32_e32 v0, s0, v0
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_and_b32_e32 v4, s4, v4
+; VI-NEXT:    v_and_b32_e32 v4, s0, v4
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,16 +561,16 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ;
 ; CI-LABEL: shl_v_imm_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_mov_b32 s8, 0xff00
-; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
@@ -580,7 +580,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
-; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index e73003b83e7c..cdac927bdd74 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -32,13 +32,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrs
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 64, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 64, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64:
@@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_subrev_u32_e32 v0, 64, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_64:
@@ -203,13 +203,13 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 64, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 64, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_64_sub_x:
@@ -220,13 +220,13 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_64_sub_x:
@@ -279,13 +279,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrs
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffbf, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xffffffbf, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_65:
@@ -296,13 +296,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffbf, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffffbf, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_65:
@@ -355,13 +355,13 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0x41, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x41, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_65_sub_x:
@@ -372,13 +372,13 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v2, 0x41, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x41, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_65_sub_x:
@@ -431,13 +431,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 ad
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_neg16:
@@ -448,13 +448,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, 16, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_neg16:
@@ -507,13 +507,13 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 ad
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, -16, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
@@ -524,13 +524,13 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v2, -16, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_sub_u32_e32 v0, -16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg16_sub_x:
@@ -583,13 +583,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 ad
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 17, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 17, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_neg17:
@@ -600,13 +600,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, 17, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_add_u32_e32 v0, 17, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_neg17:
@@ -659,13 +659,13 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 ad
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0xffffffef, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0xffffffef, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
@@ -676,13 +676,13 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v2, 0xffffffef, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0xffffffef, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg17_sub_x:
@@ -781,13 +781,13 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT:    flat_store_short v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64:
@@ -798,13 +798,13 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64:
@@ -855,36 +855,36 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT:    flat_load_ushort v0, v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT:    flat_store_dword v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s2, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s0, v0
+; GFX9-NEXT:    global_load_ushort v0, v[1:2], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v3
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; GFX9-NEXT:    global_store_dword v[3:4], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
@@ -1035,20 +1035,20 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 64
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 64, v4
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
@@ -1059,13 +1059,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
@@ -1118,38 +1118,38 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 64
+; VI-NEXT:    v_mov_b32_e32 v4, 64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, -7, v4
-; VI-NEXT:    v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, -7, v0
+; VI-NEXT:    v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x400007
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_mov_b32 s0, 0x400007
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, s4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
@@ -1202,38 +1202,38 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0xffffff85
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0xffffff85
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 64, v4
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7b0040
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_mov_b32 s0, 0x7b0040
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, s4
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
@@ -1290,15 +1290,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_add_u16_e32 v3, -7, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT:    v_add_u16_e32 v0, -7, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
@@ -1309,13 +1309,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, 7
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 7
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
@@ -1365,19 +1365,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, -16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, -16
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
@@ -1388,13 +1388,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
@@ -1443,19 +1443,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
@@ -1466,13 +1466,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
@@ -1521,19 +1521,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0xffffbc00
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0xffffbc00
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
@@ -1544,13 +1544,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
@@ -1604,20 +1604,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 32
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_subrev_u16_e32 v3, 32, v4
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
@@ -1628,13 +1628,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
@@ -1683,19 +1683,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 32
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
@@ -1706,13 +1706,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
@@ -1768,15 +1768,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_subrev_u16_e32 v3, 32, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
@@ -1787,13 +1787,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 32
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 32
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
@@ -1847,20 +1847,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, -16
+; VI-NEXT:    v_mov_b32_e32 v4, -16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, -16, v4
-; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, -16, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
@@ -1871,13 +1871,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
@@ -1926,19 +1926,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, -16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, -16
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
@@ -1949,13 +1949,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
@@ -2011,15 +2011,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT:    v_add_u16_e32 v3, -16, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT:    v_add_u16_e32 v0, -16, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
@@ -2030,13 +2030,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 16
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 16
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
@@ -2089,20 +2089,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0xffffc400
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc400
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0xffffc400, v4
-; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, 0xffffc400, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
@@ -2113,13 +2113,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
@@ -2172,20 +2172,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x4400
+; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 4.0, v4
-; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, 4.0, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
@@ -2196,13 +2196,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
@@ -2255,20 +2255,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 2.0, v4
-; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, 2.0, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
@@ -2279,13 +2279,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
@@ -2338,20 +2338,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 0xffffc000
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffffc000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0xffffc000, v4
-; VI-NEXT:    v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_add_u16_e32 v1, 0xffffc000, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
@@ -2362,13 +2362,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
@@ -2418,18 +2418,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v4, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 32
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
@@ -2440,13 +2440,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
@@ -2500,13 +2500,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_subrev_u16_e32 v2, 32, v3
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT:    flat_store_dword v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
@@ -2517,13 +2517,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_u16 v2, v3, 32
-; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 32
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:

diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index ef9bf7619512..1a585f8b39be 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -294,9 +294,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, v0
 ; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -309,9 +309,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, v0
 ; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0

diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 1a4ef205cb1c..b31c4d00fb9f 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1185,25 +1185,25 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
-; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xc
+; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
 ; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s5, s3
-; GCN-IR-NEXT:    s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT:    s_sext_i32_i16 s3, s6
+; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 24
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 24
+; GCN-IR-NEXT:    s_ashr_i32 s4, s5, 31
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 24
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_mov_b32 s5, s4
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    s_sub_u32 s8, s4, s10
-; GCN-IR-NEXT:    s_subb_u32 s9, s5, s10
+; GCN-IR-NEXT:    s_sub_u32 s8, s8, s4
+; GCN-IR-NEXT:    s_subb_u32 s9, s9, s4
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s0, s8
 ; GCN-IR-NEXT:    s_add_i32 s0, s0, 32

diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index d9a0ae7ffda8..16c6676ae582 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -8,10 +8,10 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_mov_b32 m0, -1
-; CIVI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; CIVI-NEXT:    ds_write_b16 v0, v2 offset:4
 ; CIVI-NEXT:    ds_write_b32 v0, v1
-; CIVI-NEXT:    ds_write_b8 v0, v3 offset:6
+; CIVI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; CIVI-NEXT:    ds_write_b8 v0, v1 offset:6
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -53,24 +53,24 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
 ; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s1, s[4:5], 0xc
 ; FIJI-NEXT:    s_mov_b32 m0, -1
 ; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIJI-NEXT:    v_mov_b32_e32 v2, s0
-; FIJI-NEXT:    v_mov_b32_e32 v3, s1
-; FIJI-NEXT:    s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT:    s_and_b32 s3, s1, 0xffff
 ; FIJI-NEXT:    s_add_u32 s0, s4, 14
+; FIJI-NEXT:    v_mov_b32_e32 v3, s1
 ; FIJI-NEXT:    s_addc_u32 s1, s5, 0
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s1
 ; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT:    v_mov_b32_e32 v1, s2
-; FIJI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; FIJI-NEXT:    ds_write_b16 v2, v3 offset:4
+; FIJI-NEXT:    v_mov_b32_e32 v3, s2
+; FIJI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
 ; FIJI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; FIJI-NEXT:    v_or_b32_e32 v0, s3, v0
 ; FIJI-NEXT:    v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT:    ds_write_b16 v2, v1 offset:4
 ; FIJI-NEXT:    ds_write_b8 v2, v0 offset:6
 ; FIJI-NEXT:    ds_write_b32 v2, v3
 ; FIJI-NEXT:    s_endpgm
@@ -109,9 +109,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
 ; HAWAII-NEXT:    s_mov_b32 m0, -1
 ; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s2
+; HAWAII-NEXT:    ds_write_b16 v0, v1 offset:4
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
-; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
-; HAWAII-NEXT:    ds_write_b16 v0, v2 offset:4
 ; HAWAII-NEXT:    ds_write_b32 v0, v1
 ; HAWAII-NEXT:    s_endpgm
 ;
@@ -123,9 +123,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
 ; FIJI-NEXT:    s_mov_b32 m0, -1
 ; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s2
+; FIJI-NEXT:    ds_write_b16 v0, v1 offset:4
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    v_mov_b32_e32 v2, s2
-; FIJI-NEXT:    ds_write_b16 v0, v2 offset:4
 ; FIJI-NEXT:    ds_write_b32 v0, v1
 ; FIJI-NEXT:    s_endpgm
 ;
@@ -136,10 +136,10 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
 ; GFX9-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
-; GFX9-NEXT:    ds_write_b32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT:    ds_write_b32 v0, v2
 ; GFX9-NEXT:    s_endpgm
   store i48 %arg, i48 addrspace(3)* %ptr, align 8
   ret void
@@ -154,11 +154,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
 ; HAWAII-NEXT:    s_mov_b32 m0, -1
 ; HAWAII-NEXT:    s_waitcnt lgkmcnt(0)
 ; HAWAII-NEXT:    v_mov_b32_e32 v2, s2
+; HAWAII-NEXT:    s_and_b32 s3, s3, 1
+; HAWAII-NEXT:    v_mov_b32_e32 v0, s3
+; HAWAII-NEXT:    ds_write_b8 v2, v0 offset:8
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
-; HAWAII-NEXT:    s_and_b32 s0, s3, 1
-; HAWAII-NEXT:    v_mov_b32_e32 v3, s0
-; HAWAII-NEXT:    ds_write_b8 v2, v3 offset:8
 ; HAWAII-NEXT:    ds_write_b64 v2, v[0:1]
 ; HAWAII-NEXT:    s_endpgm
 ;
@@ -170,11 +170,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
 ; FIJI-NEXT:    s_mov_b32 m0, -1
 ; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    s_and_b32 s3, s3, 1
+; FIJI-NEXT:    v_mov_b32_e32 v0, s3
+; FIJI-NEXT:    ds_write_b8 v2, v0 offset:8
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    s_and_b32 s0, s3, 1
-; FIJI-NEXT:    v_mov_b32_e32 v3, s0
-; FIJI-NEXT:    ds_write_b8 v2, v3 offset:8
 ; FIJI-NEXT:    ds_write_b64 v2, v[0:1]
 ; FIJI-NEXT:    s_endpgm
 ;
@@ -186,9 +186,9 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_and_b32 s0, s3, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    ds_write_b8 v2, v3 offset:8
 ; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -222,9 +222,9 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_mov_b32 m0, -1
-; CIVI-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; CIVI-NEXT:    ds_write_b16 v0, v1
-; CIVI-NEXT:    ds_write_b8 v0, v2 offset:2
+; CIVI-NEXT:    v_bfe_u32 v1, v1, 16, 1
+; CIVI-NEXT:    ds_write_b8 v0, v1 offset:2
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 495e79bb711d..06e4d0f4935e 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -7,7 +7,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9-LABEL: v_test_sub_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -15,8 +15,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v1, v[2:3], off
@@ -30,7 +30,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; VI-LABEL: v_test_sub_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
@@ -38,8 +38,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    flat_load_dword v1, v[2:3]
@@ -166,42 +166,42 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s8, 0x1c8007b
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_constant:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xfffffe38
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v1, 0xffffff85, v0
 ; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -216,42 +216,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s8, 0xfc21fcb3
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_neg_constant:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x3df
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v1, 0x34d, v0
 ; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -265,41 +265,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, 1
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v1, 1, v0
 ; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -313,40 +313,40 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
 ; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -361,41 +361,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s8, 1.0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 1.0
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v2, 0xffffc080
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -411,7 +411,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -419,8 +419,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v1, v[2:3], off
@@ -436,7 +436,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
@@ -444,8 +444,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dword v1, v[0:1]
 ; VI-NEXT:    flat_load_dword v2, v[2:3]
@@ -473,7 +473,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -481,14 +481,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    global_load_dword v1, v[2:3], off
+; GFX9-NEXT:    global_load_dword v1, v[4:5], off
 ; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
@@ -500,26 +500,26 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    flat_load_dword v4, v[4:5]
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u16_e32 v0, v2, v4
-; VI-NEXT:    v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
+; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -539,7 +539,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -547,8 +547,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v1, v[2:3], off
@@ -564,7 +564,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
@@ -572,8 +572,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    flat_load_dword v1, v[2:3]
@@ -603,7 +603,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -611,8 +611,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v1, v[2:3], off
@@ -631,7 +631,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
@@ -639,8 +639,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    flat_load_dword v1, v[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index a545b756fae0..8925912f71c7 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -704,41 +704,41 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    s_load_dword s3, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s5, 0xff000000
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, s4
+; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s2, s2, s5
 ; GCN-NEXT:    s_and_b32 s3, s3, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 24
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 24
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
 ; GCN-NEXT:    s_load_dword s7, s[0:1], 0xc
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
 ; GCN-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT:    v_rcp_f32_e32 v0, v2
+; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
+; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s7, s7, s4
 ; GCN-NEXT:    s_and_b32 s6, s6, s5
 ; GCN-NEXT:    s_sub_u32 s8, 0, s2
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
+; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_subb_u32 s9, 0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s8, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
@@ -749,18 +749,18 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
+; GCN-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
 ; GCN-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s8, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s9, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v7, s9, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s8, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v11, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
+; GCN-NEXT:    v_mul_lo_u32 v11, v1, v5
+; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
+; GCN-NEXT:    v_mul_hi_u32 v12, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
@@ -774,50 +774,50 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_mov_b32_e32 v3, s6
 ; GCN-NEXT:    v_alignbit_b32 v3, s7, v3, 24
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_mul_hi_u32 v5, v3, v0
+; GCN-NEXT:    v_mul_hi_u32 v5, v3, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v0, 0, v0
+; GCN-NEXT:    v_mul_hi_u32 v1, 0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, 0, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, v2
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v1
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v1
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v1
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v1
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v1
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -1, v1, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -827,15 +827,15 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT:    s_load_dword s7, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-IR-NEXT:    s_mov_b32 s6, 0xff000000
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xff000000
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_and_b32 s3, s3, s4
-; GCN-IR-NEXT:    s_and_b32 s2, s2, s6
+; GCN-IR-NEXT:    s_and_b32 s2, s2, s7
 ; GCN-IR-NEXT:    s_and_b32 s5, s5, s4
-; GCN-IR-NEXT:    s_and_b32 s4, s7, s6
+; GCN-IR-NEXT:    s_and_b32 s4, s6, s7
 ; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[2:3], 24
 ; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[4:5], 24
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 0ff21573c927..bc77c295d4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -108,12 +108,12 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT:    v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -128,11 +128,11 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -415,12 +415,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v4
+; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
   %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -809,14 +808,14 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
   %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16

diff  --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 4beebc0b34a2..b35180303e02 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -272,17 +272,17 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)
 ; VI-LABEL: no_widen_i16_constant_divergent_load:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v2, 0x3e7, v2
-; VI-NEXT:    v_or_b32_e32 v2, 4, v2
+; VI-NEXT:    v_add_u16_e32 v0, 0x3e7, v0
+; VI-NEXT:    v_or_b32_e32 v2, 4, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll
index e29e67f77d2e..9005e3f6498d 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3.ll
@@ -155,12 +155,12 @@ define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float
 ;
 ; GFX10-LABEL: xor3_uniform_vgpr:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_f32_e64 v0, s2, 1.0
 ; GFX10-NEXT:    v_add_f32_e64 v1, s3, 2.0
-; GFX10-NEXT:    v_add_f32_e64 v2, s2, 1.0
-; GFX10-NEXT:    v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT:    v_add_f32_e64 v2, 0x40400000, s4
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    ; return to shader part epilog
   %a1 = fadd float %a, 1.0
   %b2 = fadd float %b, 2.0


        

