[llvm] 0a77699 - Revert "[DAG] Attempt shl narrowing in SimplifyDemandedBits"

Kirill Stoimenov via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 15:16:06 PDT 2023


Author: Kirill Stoimenov
Date: 2023-10-04T22:15:41Z
New Revision: 0a776996af6925e693544991581eb723e06e4e68

URL: https://github.com/llvm/llvm-project/commit/0a776996af6925e693544991581eb723e06e4e68
DIFF: https://github.com/llvm/llvm-project/commit/0a776996af6925e693544991581eb723e06e4e68.diff

LOG: Revert "[DAG] Attempt shl narrowing in SimplifyDemandedBits"

This reverts commit 7a8c04ef84ecdab4390b451d4c2fe17bc45a7b63.

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
    llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
    llvm/test/CodeGen/AMDGPU/xnor.ll
    llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
    llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
    llvm/test/CodeGen/X86/avx512vnni-combine.ll
    llvm/test/CodeGen/X86/avxvnni-combine.ll
    llvm/test/CodeGen/X86/bswap.ll
    llvm/test/CodeGen/X86/buildvec-insertvec.ll
    llvm/test/CodeGen/X86/cmp-concat.ll
    llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
    llvm/test/CodeGen/X86/combine-bitreverse.ll
    llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
    llvm/test/CodeGen/X86/dagcombine-shifts.ll
    llvm/test/CodeGen/X86/divmod128.ll
    llvm/test/CodeGen/X86/extract-bits.ll
    llvm/test/CodeGen/X86/fold-and-shift.ll
    llvm/test/CodeGen/X86/fp128-i128.ll
    llvm/test/CodeGen/X86/lea-dagdag.ll
    llvm/test/CodeGen/X86/lea-opt2.ll
    llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
    llvm/test/CodeGen/X86/parity.ll
    llvm/test/CodeGen/X86/pr62653.ll
    llvm/test/CodeGen/X86/select.ll
    llvm/test/CodeGen/X86/select_const.ll
    llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
    llvm/test/CodeGen/X86/setcc.ll
    llvm/test/CodeGen/X86/shift-combine.ll
    llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
    llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
    llvm/test/CodeGen/X86/vselect.ll
    llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
    llvm/test/CodeGen/X86/zext-shl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7a897d2a2486a08..c0e88051dc427f1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1784,38 +1784,6 @@ bool TargetLowering::SimplifyDemandedBits(
         }
       }
 
-      // Narrow shift to lower half - similar to ShrinkDemandedOp.
-      // (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
-      unsigned HalfWidth = BitWidth / 2;
-      if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth) {
-        EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
-        if (isNarrowingProfitable(VT, HalfVT) &&
-            isTypeDesirableForOp(ISD::SHL, HalfVT) &&
-            isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
-            (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT))) {
-          // Unless we aren't demanding the upper bits at all, we must ensure
-          // that the upper bits of the shift result are known to be zero,
-          // which is equivalent to the narrow shift being NUW.
-          KnownBits Known0 = TLO.DAG.computeKnownBits(Op0, Depth + 1);
-          bool IsNUW = Known0.countMinLeadingZeros() >= (ShAmt + HalfWidth);
-          if (IsNUW || DemandedBits.countLeadingZeros() >= HalfWidth) {
-            unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, Depth + 1);
-            bool IsNSW = NumSignBits > (ShAmt + HalfWidth);
-            SDNodeFlags Flags;
-            Flags.setNoSignedWrap(IsNSW);
-            Flags.setNoUnsignedWrap(IsNUW);
-            SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
-            SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
-                ShAmt, HalfVT, dl, TLO.LegalTypes());
-            SDValue NewShift = TLO.DAG.getNode(ISD::SHL, dl, HalfVT, NewOp,
-                                               NewShiftAmt, Flags);
-            SDValue NewExt =
-                TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift);
-            return TLO.CombineTo(Op, NewExt);
-          }
-        }
-      }
-
       APInt InDemandedMask = DemandedBits.lshr(ShAmt);
       if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
                                Depth + 1))

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index d7b9eebff77c039..181b5b71bdd485b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -32,8 +32,7 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
 ; the base may be the RHS operand of the load in SDAG.
 ; GCN-LABEL: name: test_complex_reg_offset
 ; GCN-DAG: %[[BASE:.*]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @0 + 4,
-; SDAG-DAG: %[[OFFSET:.*]]:sreg_32 = nuw nsw S_LSHL_B32
-; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32
+; GCN-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32
 ; SDAG: S_LOAD_DWORD_SGPR_IMM killed %[[BASE]], killed %[[OFFSET]], 0, 0
 ; GISEL: S_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 0, 0
 define amdgpu_ps void @test_complex_reg_offset(ptr addrspace(1) %out) {

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 81d2aa3e9510a99..1b8216f4aa2a61f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -4669,49 +4669,49 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    s_and_b32 s4, s6, 0x7fff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_bfe_u32 s4, s8, 0xf000f
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    s_bfe_u32 s5, s6, 0xf000f
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    s_bfe_u32 s4, s8, 0xf000f
 ; GFX6-NEXT:    v_alignbit_b32 v2, s9, v2, 30
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    s_bfe_u32 s5, s6, 0xf000f
 ; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v6, -v1, v5, v6
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v7, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v4
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
 ; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v4, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v3i15:
@@ -4724,48 +4724,48 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    s_and_b32 s1, s2, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    s_bfe_u32 s0, s2, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    s_bfe_u32 s1, s6, 0xf000f
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_alignbit_b32 v3, s3, v3, 30
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_bfe_u32 s1, s6, 0xf000f
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 30
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mad_f32 v7, -v1, v6, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
 ; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    global_store_short v2, v1, s[4:5] offset:4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
@@ -4850,63 +4850,63 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
+; GFX6-NEXT:    s_and_b32 s7, s8, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_and_b32 s10, s8, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX6-NEXT:    s_and_b32 s5, s6, 0x7fff
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s5
-; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT:    v_alignbit_b32 v2, s9, v2, 30
-; GFX6-NEXT:    s_bfe_u32 s9, s8, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GFX6-NEXT:    s_bfe_u32 s5, s8, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX6-NEXT:    s_bfe_u32 s7, s6, 0xf000f
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
-; GFX6-NEXT:    s_bfe_u32 s7, s6, 0xf000f
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s7
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s6, v1
+; GFX6-NEXT:    v_alignbit_b32 v2, s9, v2, 30
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s6, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v8, v4, v8
-; GFX6-NEXT:    v_trunc_f32_e32 v8, v8
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v9, v8
-; GFX6-NEXT:    v_mad_f32 v4, -v8, v6, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v6
-; GFX6-NEXT:    s_lshr_b32 s5, s8, 15
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s5
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
+; GFX6-NEXT:    s_lshr_b32 s5, s8, 15
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    s_lshr_b32 s4, s6, 15
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v3i15:
@@ -4915,60 +4915,60 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s3, s6, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GFX9-NEXT:    s_and_b32 s8, s0, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 30
+; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s2, s6, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT:    s_bfe_u32 s2, s0, 0xf000f
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
 ; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    s_bfe_u32 s7, s6, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s7
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX9-NEXT:    s_bfe_u32 s3, s6, 0xf000f
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v7, -v5, v6, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
-; GFX9-NEXT:    v_mad_f32 v8, -v9, v4, v8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v8|, v4
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 15
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v4, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
+; GFX9-NEXT:    s_lshr_b32 s1, s0, 15
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s1
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    s_lshr_b32 s2, s6, 15
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
+; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v1
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    global_store_short v2, v1, s[4:5] offset:4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
@@ -5093,35 +5093,35 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v0
-; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s4, v5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v6
+; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v4, -v1, v2, v4
+; GFX6-NEXT:    v_mad_f32 v5, -v1, v2, v5
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v5
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v3i15:
@@ -5160,36 +5160,36 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
 ; GFX9-NEXT:    s_or_b32 s2, s0, 1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 30
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, v0
+; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
+; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
-; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v5
-; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
-; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v3, s0, v6
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    global_store_short v2, v1, s[4:5] offset:4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
@@ -5325,40 +5325,40 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s6, v4
 ; GFX6-NEXT:    s_or_b32 s6, s4, 1
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v5
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, s4, v7
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
-; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v8
+; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_mul_f32_e32 v2, v7, v8
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_mad_f32 v6, -v2, v5, v6
+; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s9
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v7
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s9
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v5
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_v3i15:
@@ -5369,74 +5369,74 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf0000
 ; GFX9-NEXT:    s_bfe_i32 s0, s2, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s1
 ; GFX9-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 15
-; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
+; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 30
 ; GFX9-NEXT:    v_alignbit_b32 v1, s3, v1, 30
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 15
 ; GFX9-NEXT:    s_or_b32 s7, s0, 1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v6|, |v5|
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
-; GFX9-NEXT:    v_add_u32_e32 v5, s0, v7
+; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
 ; GFX9-NEXT:    s_bfe_i32 s0, s2, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
 ; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, s1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s1
 ; GFX9-NEXT:    s_xor_b32 s0, s1, s0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
-; GFX9-NEXT:    v_mul_f32_e32 v8, v7, v8
-; GFX9-NEXT:    v_trunc_f32_e32 v8, v8
-; GFX9-NEXT:    v_mad_f32 v7, -v8, v6, v7
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s2
+; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
+; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s2
 ; GFX9-NEXT:    s_or_b32 s2, s0, 1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, |v6|
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v6|, |v5|
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v8
-; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v1
-; GFX9-NEXT:    v_mad_f32 v1, -v1, v6, v7
-; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v6|
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    v_add_u32_e32 v0, v9, v0
 ; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, s0, v8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
-; GFX9-NEXT:    v_sub_u32_e32 v0, v3, v0
+; GFX9-NEXT:    v_add_u32_e32 v5, s0, v7
+; GFX9-NEXT:    v_bfe_i32 v7, v0, 0, 15
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
+; GFX9-NEXT:    v_xor_b32_e32 v1, v7, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v9
+; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v7
+; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s3
+; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v4
+; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    global_store_short v2, v1, s[4:5] offset:4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index b5a823355543c9c..5990736f664fbe8 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -37,43 +37,44 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   $vgpr22 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $vgpr10 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr24 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $agpr0 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $vgpr26 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr20 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $vgpr22 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr25 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr27 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
   ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.57, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr1, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr0 = V_LSHLREV_B32_e32 2, $vgpr4, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -81,7 +82,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.5:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
@@ -94,9 +95,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
@@ -104,32 +105,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr44_vgpr45 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr42_vgpr43 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.6.Flow20:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr21 = COPY $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr20 = COPY $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr23 = COPY $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22 = COPY $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr25 = COPY $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24 = COPY $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr27 = COPY $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr26 = COPY $sgpr17, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
   ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -137,7 +138,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -146,15 +147,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.9.bb89:
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
   ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -163,15 +164,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.11.bb84:
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
   ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -180,10 +181,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.13.bb79:
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
-  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+  ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
@@ -365,7 +366,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
@@ -382,37 +383,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr44_vgpr45 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.36.Flow21:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
@@ -420,29 +421,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.38.Flow22:
   ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -463,7 +464,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
@@ -471,28 +472,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -512,7 +513,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -521,33 +522,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr20, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.46, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
@@ -564,7 +565,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
   ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
@@ -572,33 +573,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc
-  ; GFX90A-NEXT:   $vgpr10 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $agpr0 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr18, $vgpr30, $vgpr31, $vgpr61, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr10, $vgpr12
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.45.Flow26:
   ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
@@ -614,7 +615,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -628,26 +629,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.47.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -665,135 +666,133 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.49:
   ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
   ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.51:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.52.bb80:
   ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr9, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.53:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_BRANCH %bb.61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.54.bb73:
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
-  ; GFX90A-NEXT:   renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
+  ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $agpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.52, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.55.Flow29:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.56.bb90:
   ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr56, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr5, 1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.60
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.57:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
@@ -804,9 +803,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
@@ -816,12 +815,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr42_vgpr43 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr40_vgpr41 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr46_vgpr47 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr53 = COPY renamable $vgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr13 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr16 = COPY renamable $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr30 = COPY renamable $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr18 = COPY renamable $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr54 = COPY renamable $vgpr19, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr15 = COPY renamable $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr14 = COPY renamable $vgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.7
   ; GFX90A-NEXT: {{  $}}
@@ -830,15 +829,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
+  ; GFX90A-NEXT:   renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr23 = S_MOV_B32 0
   ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
@@ -846,37 +845,38 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.59.bb85:
   ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
   ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr30 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr54 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.60.Flow31:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec
+  ; GFX90A-NEXT:   renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.61.Flow30:
   ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
@@ -888,7 +888,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.62.bb140:
   ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -896,120 +896,122 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.63.Flow13:
   ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.64.bb159:
   ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.67, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.65.Flow10:
   ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.66.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.67.bb161:
   ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr3 = COPY killed renamable $agpr1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr3, killed $vgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr54, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr4 = V_OR_B32_e32 killed $vgpr30, killed $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
   ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 $vgpr48, $vgpr12, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr32 = V_CNDMASK_B32_e64 0, $vgpr36, 0, 0, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr50 = V_OR_B32_e32 $vgpr32, $vgpr20, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = COPY renamable $agpr0_agpr1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr50, killed $vgpr12, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr14, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr52 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr34, killed $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.69.Flow:
   ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.70.bb186:
   ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr3, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr29 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr53 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.71.Flow9:
   ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.63
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.72.bb196:
   ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.69
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 3207f9e61e3140b..73d5088141cdb31 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -449,14 +449,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s5, s2
 ; GCN-O0-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
-; GCN-O0-NEXT:    s_mov_b32 s4, 2
-; GCN-O0-NEXT:    v_lshlrev_b32_e64 v3, s4, v1
 ; GCN-O0-NEXT:    s_mov_b32 s4, 0
 ; GCN-O0-NEXT:    ; implicit-def: $sgpr4
+; GCN-O0-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    s_mov_b32 s4, 2
+; GCN-O0-NEXT:    v_lshl_b64 v[3:4], v[2:3], s4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-O0-NEXT:    buffer_store_dword v2, v[3:4], s[0:3], 0 addr64
 ; GCN-O0-NEXT:    s_mov_b32 s0, 1
@@ -684,14 +684,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-O0-NEXT:    s_mov_b32 s0, 2
-; GCN-O0-NEXT:    v_lshlrev_b32_e64 v3, s0, v1
-; GCN-O0-NEXT:    s_mov_b32 s1, 0
-; GCN-O0-NEXT:    ; implicit-def: $sgpr1
+; GCN-O0-NEXT:    s_mov_b32 s0, 0
+; GCN-O0-NEXT:    ; implicit-def: $sgpr0
+; GCN-O0-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-O0-NEXT:    s_waitcnt expcnt(0)
-; GCN-O0-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; GCN-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-O0-NEXT:    s_mov_b32 s0, 2
+; GCN-O0-NEXT:    s_mov_b32 s1, s0
+; GCN-O0-NEXT:    v_lshl_b64 v[3:4], v[2:3], s1
 ; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-O0-NEXT:    s_mov_b32 s2, s4
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 8382624af5d749e..64060ebbb159ec5 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -646,8 +646,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_movk_i32 s4, 0x400
+; GFX9-NEXT:    s_movk_i32 s6, 0x400
+; GFX9-NEXT:    s_mov_b32 s7, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -655,19 +657,22 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1]
+; GFX9-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX9-NEXT:    v_add_u16_e64 v3, s7, 1
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v3
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v3, v4, s[2:3]
+; GFX9-NEXT:    s_add_u32 s8, s2, s0
+; GFX9-NEXT:    v_mad_f32 v3, -v3, v0, v4
+; GFX9-NEXT:    s_addc_u32 s9, s3, s1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], 0, v5, s[0:1]
+; GFX9-NEXT:    global_store_short v2, v3, s[8:9]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -678,25 +683,30 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB4_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX10-NEXT:    v_add_nc_u16 v3, s4, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[0:1], 1
+; GFX10-NEXT:    s_add_u32 s6, s2, s4
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v1
+; GFX10-NEXT:    s_addc_u32 s7, s3, s5
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_mad_f32 v4, -v3, v0, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
-; GFX10-NEXT:    global_store_short v3, v4, s[2:3]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, 0, v3, s0
+; GFX10-NEXT:    global_store_short v2, v3, s[6:7]
 ; GFX10-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -707,31 +717,36 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB4_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX11-NEXT:    v_add_nc_u16 v3, s4, 1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX11-NEXT:    s_lshl_b64 s[4:5], s[0:1], 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_add_u32 s6, s2, s4
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v1
+; GFX11-NEXT:    s_addc_u32 s7, s3, s5
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_fma_f32 v4, -v3, v0, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
-; GFX11-NEXT:    global_store_b16 v3, v4, s[2:3]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, 0, v3, s0
+; GFX11-NEXT:    global_store_b16 v2, v3, s[6:7]
 ; GFX11-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
@@ -758,31 +773,33 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: urem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_movk_i32 s5, 0x400
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_movk_i32 s7, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX9-NEXT:    s_and_b32 s6, s2, 0xffff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  .LBB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
-; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v5, v3, s[2:3]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v7, s5
+; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
+; GFX9-NEXT:    v_mad_f32 v8, -v9, v2, v8
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v2
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s6
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
+; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -792,27 +809,30 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX10-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  .LBB5_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, v4, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
-; GFX10-NEXT:    global_store_short v5, v3, s[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v3
+; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
+; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
+; GFX10-NEXT:    v_mad_f32 v7, -v8, v2, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
+; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz .LBB5_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -820,36 +840,38 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-LABEL: urem16_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s1
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB5_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX11-NEXT:    v_add_nc_u16 v4, v4, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX11-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
-; GFX11-NEXT:    v_mul_lo_u32 v4, v4, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
-; GFX11-NEXT:    global_store_b16 v5, v3, s[0:1]
+; GFX11-NEXT:    v_mul_f32_e32 v8, v7, v3
+; GFX11-NEXT:    v_add_co_u32 v5, s0, s2, v5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_trunc_f32_e32 v8, v8
+; GFX11-NEXT:    v_fma_f32 v7, -v8, v2, v7
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v7, v7, s1
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
+; GFX11-NEXT:    global_store_b16 v[5:6], v0, off
 ; GFX11-NEXT:    s_cbranch_vccz .LBB5_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
@@ -876,35 +898,38 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: sdiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s3, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_movk_i32 s5, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB6_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT:    s_xor_b32 s6, s5, s2
-; GFX9-NEXT:    s_ashr_i32 s5, s6, 30
-; GFX9-NEXT:    s_or_b32 s5, s5, 1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    v_add_u16_e64 v2, s4, 1
-; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX9-NEXT:    s_and_b32 s6, 0xffff, s4
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, s5, v4
-; GFX9-NEXT:    s_lshl_b32 s5, s6, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX9-NEXT:    s_sext_i32_i16 s2, s6
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s2
+; GFX9-NEXT:    s_xor_b32 s7, s2, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v4|, |v0|
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX9-NEXT:    s_cselect_b32 s7, s2, 0
+; GFX9-NEXT:    s_and_b32 s2, s6, 0xffff
+; GFX9-NEXT:    v_add_u16_e64 v3, s6, 1
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], 1
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX9-NEXT:    s_add_u32 s8, s0, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, s7, v5
+; GFX9-NEXT:    s_addc_u32 s9, s1, s9
+; GFX9-NEXT:    global_store_short v2, v3, s[8:9]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -914,33 +939,36 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s0, s4
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX10-NEXT:    s_sext_i32_i16 s4, s4
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB6_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_sext_i32_i16 s4, s1
-; GFX10-NEXT:    v_add_nc_u16 v2, s1, 1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX10-NEXT:    s_xor_b32 s5, s4, s0
-; GFX10-NEXT:    s_ashr_i32 s4, s5, 30
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX10-NEXT:    s_or_b32 s4, s4, 1
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s5, |v3|, |v0|
-; GFX10-NEXT:    s_and_b32 s5, s5, exec_lo
-; GFX10-NEXT:    s_cselect_b32 s4, s4, 0
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s1
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, s4, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    global_store_short v3, v2, s[2:3]
+; GFX10-NEXT:    s_sext_i32_i16 s0, s5
+; GFX10-NEXT:    v_add_nc_u16 v3, s5, 1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX10-NEXT:    s_xor_b32 s0, s0, s4
+; GFX10-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX10-NEXT:    s_or_b32 s0, s0, 1
+; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, |v4|, |v0|
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v4, v5
+; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s6, s0, 0
+; GFX10-NEXT:    s_and_b32 s0, s5, 0xffff
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, s6, v4
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], 1
+; GFX10-NEXT:    s_add_u32 s6, s2, s6
+; GFX10-NEXT:    s_addc_u32 s7, s3, s7
+; GFX10-NEXT:    global_store_short v2, v3, s[6:7]
 ; GFX10-NEXT:    s_cbranch_vccz .LBB6_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -950,39 +978,43 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB6_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT:    s_xor_b32 s5, s4, s2
+; GFX11-NEXT:    s_sext_i32_i16 s2, s5
+; GFX11-NEXT:    v_add_nc_u16 v3, s5, 1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, s2
+; GFX11-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_ashr_i32 s4, s5, 30
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
+; GFX11-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX11-NEXT:    s_or_b32 s4, s4, 1
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX11-NEXT:    s_or_b32 s2, s2, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX11-NEXT:    v_fma_f32 v3, -v4, v0, v3
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s5, |v3|, |v0|
-; GFX11-NEXT:    s_and_b32 s5, s5, exec_lo
-; GFX11-NEXT:    s_cselect_b32 s4, s4, 0
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v4|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v4, v5
+; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX11-NEXT:    s_and_b32 s2, s5, 0xffff
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, s6, v4
+; GFX11-NEXT:    s_lshl_b64 s[6:7], s[2:3], 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
-; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    s_add_u32 s6, s0, s6
+; GFX11-NEXT:    s_addc_u32 s7, s1, s7
+; GFX11-NEXT:    global_store_b16 v2, v3, s[6:7]
 ; GFX11-NEXT:    s_cbranch_vccz .LBB6_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
@@ -1009,37 +1041,40 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: srem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    s_movk_i32 s3, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_movk_i32 s5, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB7_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_sext_i32_i16 s5, s4
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT:    s_xor_b32 s6, s5, s2
-; GFX9-NEXT:    s_ashr_i32 s6, s6, 30
-; GFX9-NEXT:    s_or_b32 s8, s6, 1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    v_add_u16_e64 v2, s4, 1
-; GFX9-NEXT:    s_cselect_b32 s6, s8, 0
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s4
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, s6, v4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX9-NEXT:    s_sext_i32_i16 s7, s6
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s7
+; GFX9-NEXT:    s_xor_b32 s2, s7, s4
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    s_or_b32 s2, s2, 1
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v4|, |v0|
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX9-NEXT:    v_add_u16_e64 v3, s6, 1
+; GFX9-NEXT:    s_cselect_b32 s8, s2, 0
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX9-NEXT:    s_and_b32 s2, s6, 0xffff
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, s8, v5
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s4
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], 1
+; GFX9-NEXT:    s_add_u32 s8, s0, s8
+; GFX9-NEXT:    s_addc_u32 s9, s1, s9
+; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
+; GFX9-NEXT:    global_store_short v2, v3, s[8:9]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -1049,36 +1084,38 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s0, s4
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GFX10-NEXT:    s_sext_i32_i16 s4, s4
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB7_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_sext_i32_i16 s4, s1
-; GFX10-NEXT:    v_add_nc_u16 v2, s1, 1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX10-NEXT:    s_xor_b32 s5, s4, s0
-; GFX10-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX10-NEXT:    s_or_b32 s5, s5, 1
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, |v3|, |v0|
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; GFX10-NEXT:    s_sext_i32_i16 s8, s5
+; GFX10-NEXT:    v_add_nc_u16 v3, s5, 1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, s8
+; GFX10-NEXT:    s_xor_b32 s0, s8, s4
+; GFX10-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX10-NEXT:    s_or_b32 s0, s0, 1
+; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, |v4|, |v0|
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
-; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, s5, v3
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s1
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
-; GFX10-NEXT:    global_store_short v2, v3, s[2:3]
+; GFX10-NEXT:    s_cselect_b32 s6, s0, 0
+; GFX10-NEXT:    s_and_b32 s0, s5, 0xffff
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, s6, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], 1
+; GFX10-NEXT:    s_add_u32 s6, s2, s6
+; GFX10-NEXT:    v_mul_lo_u32 v3, v4, s4
+; GFX10-NEXT:    s_addc_u32 s7, s3, s7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
+; GFX10-NEXT:    global_store_short v2, v3, s[6:7]
 ; GFX10-NEXT:    s_cbranch_vccz .LBB7_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -1088,44 +1125,46 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-NEXT:    s_sext_i32_i16 s4, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB7_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT:    s_xor_b32 s5, s4, s2
+; GFX11-NEXT:    s_sext_i32_i16 s8, s5
+; GFX11-NEXT:    v_add_nc_u16 v3, s5, 1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, s8
+; GFX11-NEXT:    s_xor_b32 s2, s8, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
+; GFX11-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX11-NEXT:    s_or_b32 s5, s5, 1
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX11-NEXT:    s_or_b32 s2, s2, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX11-NEXT:    v_fma_f32 v3, -v4, v0, v3
+; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v3|, |v0|
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v4|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v4, v5
 ; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
-; GFX11-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, s5, v3
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
-; GFX11-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
-; GFX11-NEXT:    global_store_b16 v2, v3, s[0:1]
+; GFX11-NEXT:    s_cselect_b32 s6, s2, 0
+; GFX11-NEXT:    s_and_b32 s2, s5, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, s6, v4
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX11-NEXT:    s_lshl_b64 s[6:7], s[2:3], 1
+; GFX11-NEXT:    s_add_u32 s6, s0, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v3, v4, s4
+; GFX11-NEXT:    s_addc_u32 s7, s1, s7
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
+; GFX11-NEXT:    global_store_b16 v2, v3, s[6:7]
 ; GFX11-NEXT:    s_cbranch_vccz .LBB7_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 6f61af0418636a9..a462c19ce645d4a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -28,136 +28,138 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1000
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1800
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
 ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2000
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2800
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
 ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
 ; GFX8-NEXT:    s_movk_i32 s0, 0x3000
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x3800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v11
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v13, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v15, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: clmem_read_simplified:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18
-; GFX9-NEXT:    v_mov_b32_e32 v3, 3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s1, 0x2000
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: clmem_read_simplified:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v18, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v18
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX900-NEXT:    s_movk_i32 s1, 0x2000
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
+; GFX900-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX900-NEXT:    s_movk_i32 s0, 0x1000
+; GFX900-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
+; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
+; GFX900-NEXT:    s_movk_i32 s0, 0x3000
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(5)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(3)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX900-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: clmem_read_simplified:
 ; GFX10:       ; %bb.0: ; %entry
@@ -179,14 +181,15 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v20
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
@@ -231,6 +234,77 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: clmem_read_simplified:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v18, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s34, v18
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 3, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    s_movk_i32 s1, 0x2000
+; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
+; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
+; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
+; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
+; GFX90A-NEXT:    s_movk_i32 s0, 0x3000
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(3)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX90A-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: clmem_read_simplified:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -242,17 +316,17 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v16
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048
@@ -362,97 +436,98 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xfe000000, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s35
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v3, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x5000
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_movk_i32 s0, 0x7f
 ; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
-; GFX8-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:  .LBB1_2: ; %for.body
 ; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xffffb000, v6
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xffffb800, v6
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 0xffffc000, v6
-; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
-; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
-; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0xffffc800, v6
-; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 0xffffd000, v6
-; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
-; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
-; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xffffd800, v6
-; GFX8-NEXT:    v_addc_u32_e32 v19, vcc, -1, v7, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
-; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[18:19]
-; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xffffe000, v6
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xffffe800, v6
-; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[20:21]
-; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v7, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[22:23], v[22:23]
-; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xfffff000, v6
-; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, -1, v7, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[24:25], v[24:25]
-; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0xfffff800, v6
-; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, -1, v7, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[26:27], v[26:27]
-; GFX8-NEXT:    flat_load_dwordx2 v[28:29], v[6:7]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x10000, v6
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xffffb000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xffffb800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0xffffc000, v5
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
+; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0xffffc800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0xffffd000, v5
+; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[11:12]
+; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 0xffffd800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
+; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 0xffffe000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 0xffffe800, v5
+; GFX8-NEXT:    flat_load_dwordx2 v[19:20], v[19:20]
+; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[21:22], v[21:22]
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 0xfffff000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[23:24], v[23:24]
+; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 0xfffff800, v5
+; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, -1, v6, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[25:26], v[25:26]
+; GFX8-NEXT:    flat_load_dwordx2 v[27:28], v[5:6]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x10000, v5
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX8-NEXT:    s_addk_i32 s1, 0x2000
 ; GFX8-NEXT:    s_cmp_gt_u32 s1, 0x3fffff
 ; GFX8-NEXT:    s_waitcnt vmcnt(10)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v8, v4
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(9)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v11, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(8)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v12, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v13, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v11, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v14, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v15, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v13, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v14, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v17, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v16, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v18, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v19, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v17, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v18, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v20, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v21, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v19, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v20, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v22, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v23, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v21, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v22, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v24, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v25, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v23, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v24, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v26, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v27, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v25, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v26, v4, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v28, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v29, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v27, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v28, v4, vcc
 ; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
 ; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
@@ -463,10 +538,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s0, s1
 ; GFX8-NEXT:    s_branch .LBB1_1
 ; GFX8-NEXT:  .LBB1_5: ; %while.end
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: clmem_read:
@@ -490,12 +565,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
+; GFX900-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
 ; GFX900-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
-; GFX900-NEXT:    v_lshl_or_b32 v1, v1, 3, v0
-; GFX900-NEXT:    v_mov_b32_e32 v2, s35
+; GFX900-NEXT:    v_or_b32_e32 v1, v1, v0
+; GFX900-NEXT:    v_mov_b32_e32 v3, s35
 ; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
-; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GFX900-NEXT:    s_movk_i32 s0, 0x5000
 ; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
@@ -605,15 +682,17 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_movk_i32 s1, 0x7f
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xfe000000, v1
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 3, v0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, s34
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, 0, s35, s0
+; GFX10-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v0
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, s34
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, 0x5000, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader
@@ -717,13 +796,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    s_mov_b32 s32, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
-; GFX90A-NEXT:    v_lshl_or_b32 v1, v1, 3, v0
+; GFX90A-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
+; GFX90A-NEXT:    v_or_b32_e32 v1, v2, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, s34, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v2, vcc
 ; GFX90A-NEXT:    s_movk_i32 s0, 0x5000
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
@@ -822,18 +903,20 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 17, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0xff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 17, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_movk_i32 s1, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xfe000000, v1
-; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, v1, s34
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, s35, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v0
+; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, v1, s34
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, 0x5000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
@@ -1052,38 +1135,39 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 2
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x400
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xc00
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1000
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1400
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1800
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1c00
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2000
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX8-NEXT:    flat_load_dword v19, v[5:6]
 ; GFX8-NEXT:    flat_load_dword v7, v[7:8]
 ; GFX8-NEXT:    flat_load_dword v8, v[9:10]
@@ -1091,89 +1175,90 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX8-NEXT:    flat_load_dword v10, v[13:14]
 ; GFX8-NEXT:    flat_load_dword v11, v[15:16]
 ; GFX8-NEXT:    flat_load_dword v12, v[17:18]
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x2400, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x2400, v0
 ; GFX8-NEXT:    flat_load_dword v5, v[5:6]
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[3:4]
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(8)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v19, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v19, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(7)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v7, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v8, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v9, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v11, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v11, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v12, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v12, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: Address32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
-; GFX9-NEXT:    global_load_dword v9, v[2:3], off
-; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024
-; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048
-; GFX9-NEXT:    global_load_dword v12, v[2:3], off offset:3072
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v2, v[0:1], off
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add3_u32 v0, v11, v0, v12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v3
-; GFX9-NEXT:    global_store_dword v4, v0, s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: Address32:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX900-NEXT:    s_movk_i32 s0, 0x1000
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dword v5, v[0:1], off
+; GFX900-NEXT:    global_load_dword v6, v[0:1], off offset:1024
+; GFX900-NEXT:    global_load_dword v7, v[0:1], off offset:2048
+; GFX900-NEXT:    global_load_dword v8, v[0:1], off offset:3072
+; GFX900-NEXT:    global_load_dword v9, v[2:3], off
+; GFX900-NEXT:    global_load_dword v10, v[2:3], off offset:1024
+; GFX900-NEXT:    global_load_dword v11, v[2:3], off offset:2048
+; GFX900-NEXT:    global_load_dword v12, v[2:3], off offset:3072
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dword v2, v[0:1], off
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_waitcnt vmcnt(8)
+; GFX900-NEXT:    v_add_u32_e32 v0, v6, v5
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
+; GFX900-NEXT:    v_add3_u32 v0, v7, v0, v8
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
+; GFX900-NEXT:    v_add3_u32 v0, v9, v0, v10
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add3_u32 v0, v11, v0, v12
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add3_u32 v0, v2, v0, v3
+; GFX900-NEXT:    global_store_dword v4, v0, s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Address32:
 ; GFX10:       ; %bb.0: ; %entry
@@ -1195,14 +1280,15 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 2
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
@@ -1241,6 +1327,64 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    global_store_dword v8, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: Address32:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v4, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, s34, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 2, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v6, v[0:1], off offset:1024
+; GFX90A-NEXT:    global_load_dword v7, v[0:1], off offset:2048
+; GFX90A-NEXT:    global_load_dword v8, v[0:1], off offset:3072
+; GFX90A-NEXT:    global_load_dword v9, v[2:3], off
+; GFX90A-NEXT:    global_load_dword v10, v[2:3], off offset:1024
+; GFX90A-NEXT:    global_load_dword v11, v[2:3], off offset:2048
+; GFX90A-NEXT:    global_load_dword v12, v[2:3], off offset:3072
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dword v2, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_waitcnt vmcnt(8)
+; GFX90A-NEXT:    v_add_u32_e32 v0, v6, v5
+; GFX90A-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-NEXT:    v_add3_u32 v0, v7, v0, v8
+; GFX90A-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-NEXT:    v_add3_u32 v0, v9, v0, v10
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add3_u32 v0, v11, v0, v12
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add3_u32 v0, v2, v0, v3
+; GFX90A-NEXT:    global_store_dword v4, v0, s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: Address32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -1252,17 +1396,17 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b32 v7, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024
@@ -1369,87 +1513,89 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf000
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf800
-; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4]
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 1, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v7
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: Offset64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
-; GFX9-NEXT:    v_mov_b32_e32 v3, 3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
-; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: Offset64:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v12
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX900-NEXT:    s_movk_i32 s0, 0xf000
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX900-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
 ; GFX10:       ; %bb.0: ; %entry
@@ -1471,14 +1617,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v12
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0xfffff800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
@@ -1501,6 +1648,56 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: Offset64:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s34, v12
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 3, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
+; GFX90A-NEXT:    s_movk_i32 s0, 0xf000
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX90A-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: Offset64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -1512,17 +1709,17 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v8
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v8
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
@@ -1597,80 +1794,81 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 2
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_mov_b32 s0, 0x7ffff800
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_mov_b32 s0, 0x7ffffc00
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX8-NEXT:    flat_load_dword v5, v[5:6]
 ; GFX8-NEXT:    flat_load_dword v6, v[7:8]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x80000000, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80000000, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v5, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: p32Offset64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v6
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s0, 0x7ffff000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off
-; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:2048
-; GFX9-NEXT:    global_load_dword v9, v[2:3], off offset:3072
-; GFX9-NEXT:    global_load_dword v10, v[4:5], off
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_u32_e32 v0, v8, v7
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
-; GFX9-NEXT:    global_store_dword v6, v0, s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: p32Offset64:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v6
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dword v7, v[0:1], off
+; GFX900-NEXT:    global_load_dword v8, v[2:3], off offset:2048
+; GFX900-NEXT:    global_load_dword v9, v[2:3], off offset:3072
+; GFX900-NEXT:    global_load_dword v10, v[4:5], off
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_u32_e32 v0, v8, v7
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add3_u32 v0, v9, v0, v10
+; GFX900-NEXT:    global_store_dword v6, v0, s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: p32Offset64:
 ; GFX10:       ; %bb.0: ; %entry
@@ -1692,14 +1890,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 2
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x80000000
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off
@@ -1716,6 +1915,51 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    global_store_dword v4, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: p32Offset64:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s34, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 2, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dword v7, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v8, v[2:3], off offset:2048
+; GFX90A-NEXT:    global_load_dword v9, v[2:3], off offset:3072
+; GFX90A-NEXT:    global_load_dword v10, v[4:5], off
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_u32_e32 v0, v8, v7
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add3_u32 v0, v9, v0, v10
+; GFX90A-NEXT:    global_store_dword v6, v0, s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: p32Offset64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -1727,17 +1971,17 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
@@ -2078,135 +2322,137 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x3800
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x3000
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2800
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
 ; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2000
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1800
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
 ; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
 ; GFX8-NEXT:    s_movk_i32 s0, 0x1000
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x800, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v11
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v11
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v12, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v13, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v14, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v15, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v16, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v17, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v18, v5, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: ReverseOrder:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22
-; GFX9-NEXT:    v_mov_b32_e32 v3, 3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    s_movk_i32 s0, 0x2000
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: ReverseOrder:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v22, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v22
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX900-NEXT:    s_movk_i32 s0, 0x3000
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
+; GFX900-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
+; GFX900-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX900-NEXT:    s_movk_i32 s0, 0x2000
+; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
+; GFX900-NEXT:    s_movk_i32 s0, 0x1000
+; GFX900-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
+; GFX900-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
+; GFX900-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
+; GFX900-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(5)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(4)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
+; GFX900-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ReverseOrder:
 ; GFX10:       ; %bb.0: ; %entry
@@ -2228,14 +2474,15 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v20
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
@@ -2284,6 +2531,76 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    global_store_dwordx2 v20, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: ReverseOrder:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v22, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s34, v22
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 3, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    s_movk_i32 s0, 0x3000
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
+; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:2048
+; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
+; GFX90A-NEXT:    s_movk_i32 s0, 0x2000
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[10:11], v[4:5], off offset:2048
+; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
+; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
+; GFX90A-NEXT:    global_load_dwordx2 v[16:17], v[4:5], off
+; GFX90A-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
+; GFX90A-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(6)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(5)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(4)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
+; GFX90A-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: ReverseOrder:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -2295,17 +2612,17 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v16
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
@@ -2416,69 +2733,71 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX8-NEXT:    s_mov_b32 s32, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s35
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s34, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v4, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v6, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: negativeoffset:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v8
-; GFX9-NEXT:    v_mov_b32_e32 v3, 3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
-; GFX9-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: negativeoffset:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s3
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v8, 0xffff8000, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, s34, v8
+; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX900-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX900-NEXT:    s_movk_i32 s0, 0x1000
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v1, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX900-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
+; GFX900-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: negativeoffset:
 ; GFX10:       ; %bb.0: ; %entry
@@ -2500,18 +2819,19 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX10-NEXT:    s_mov_b32 s32, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 3
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v2, s0, s34, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s35, 0, s0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
@@ -2521,6 +2841,49 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: negativeoffset:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s3
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v8, 0xffff8000, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s34, v8
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 3, v[2:3]
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v1, vcc
+; GFX90A-NEXT:    s_movk_i32 s0, 0x1000
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
+; GFX90A-NEXT:    global_store_dwordx2 v8, v[0:1], s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: negativeoffset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
@@ -2532,22 +2895,22 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff8000, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, s34, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 90a06d8e2d0a2f8..08db1e7fee259d6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10083,16 +10083,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_add_u32 s40, s40, s3
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
-; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, -1, v0
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v5, -1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
+; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v5
+; GFX6-NEXT:    v_mov_b32_e32 v8, v6
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83800
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10101,7 +10102,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224
+; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208
 ; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10110,7 +10120,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192
 ; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10119,7 +10129,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176
 ; GFX6-NEXT:    s_mov_b32 s2, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10128,7 +10138,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160
 ; GFX6-NEXT:    s_mov_b32 s2, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10137,7 +10147,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144
 ; GFX6-NEXT:    s_mov_b32 s2, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10146,7 +10156,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128
 ; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10155,7 +10165,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112
 ; GFX6-NEXT:    s_mov_b32 s2, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10164,7 +10174,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96
 ; GFX6-NEXT:    s_mov_b32 s2, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10173,7 +10183,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80
 ; GFX6-NEXT:    s_mov_b32 s2, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10181,27 +10191,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_load_dwordx4 v[16:19], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16
 ; GFX6-NEXT:    s_mov_b32 s2, 0x80800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
@@ -10216,9 +10217,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX6-NEXT:    buffer_load_dwordx4 v[20:23], v[5:6], s[4:7], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48
+; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(3)
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX6-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen
@@ -10236,7 +10245,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s12, 0x83800
+; GFX6-NEXT:    s_mov_b32 s12, 0x83c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10276,7 +10285,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84000
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10284,7 +10293,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x83800
+; GFX6-NEXT:    s_mov_b32 s38, 0x83c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
@@ -10312,7 +10321,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84800
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10320,7 +10329,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84000
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
@@ -10348,7 +10357,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x85000
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10356,7 +10365,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84800
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
@@ -10380,7 +10389,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s38, 0x85800
+; GFX6-NEXT:    s_mov_b32 s38, 0x85c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10394,7 +10403,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
-; GFX6-NEXT:    s_mov_b32 s36, 0x85c00
+; GFX6-NEXT:    s_mov_b32 s36, 0x86000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10406,7 +10415,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
-; GFX6-NEXT:    s_mov_b32 s4, 0x86000
+; GFX6-NEXT:    s_mov_b32 s4, 0x86400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10414,7 +10423,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x85000
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
@@ -10432,7 +10441,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2170
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
@@ -10447,7 +10456,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
@@ -10463,7 +10472,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s6, 0x85800
+; GFX6-NEXT:    s_mov_b32 s6, 0x85c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
@@ -10475,46 +10484,35 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_mov_b32 s2, 0x84000
-; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_mov_b32 s2, 0x84800
-; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    v_mov_b32_e32 v0, v20
-; GFX6-NEXT:    v_mov_b32_e32 v1, v21
-; GFX6-NEXT:    v_mov_b32_e32 v2, v22
-; GFX6-NEXT:    v_mov_b32_e32 v3, v23
-; GFX6-NEXT:    s_waitcnt expcnt(3)
+; GFX6-NEXT:    s_mov_b32 s2, 0x84400
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(4)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v17
+; GFX6-NEXT:    v_mov_b32_e32 v1, v18
+; GFX6-NEXT:    v_mov_b32_e32 v2, v19
+; GFX6-NEXT:    v_mov_b32_e32 v3, v20
+; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x84000
-; GFX6-NEXT:    v_mov_b32_e32 v23, v3
-; GFX6-NEXT:    buffer_load_dword v12, off, s[40:43], s2 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
-; GFX6-NEXT:    v_mov_b32_e32 v22, v2
-; GFX6-NEXT:    v_mov_b32_e32 v21, v1
-; GFX6-NEXT:    v_mov_b32_e32 v20, v0
+; GFX6-NEXT:    v_mov_b32_e32 v20, v3
+; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    v_mov_b32_e32 v19, v2
+; GFX6-NEXT:    v_mov_b32_e32 v18, v1
+; GFX6-NEXT:    v_mov_b32_e32 v17, v0
 ; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
@@ -10547,175 +10545,183 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83400
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s4, 0x83800
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[5:6], 8
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    s_mov_b32 s4, 0x83400
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:240
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:224
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:208
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:192
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:176
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:160
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:144
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:128
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:112
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x80c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:96
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80
+; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s4, 0x80800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:80
-; GFX6-NEXT:    buffer_store_dwordx4 v[16:19], v[5:6], s[0:3], 0 addr64 offset:64
-; GFX6-NEXT:    buffer_store_dwordx4 v[20:23], v[5:6], s[0:3], 0 addr64 offset:48
-; GFX6-NEXT:    buffer_store_dwordx4 v[12:15], v[5:6], s[0:3], 0 addr64 offset:32
-; GFX6-NEXT:    s_waitcnt expcnt(3)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32
+; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:16
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:16
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: test_limited_sgpr:
 ; GFX9-FLATSCR:       ; %bb.0: ; %entry
 ; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
 ; GFX9-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX9-FLATSCR-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
+; GFX9-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v5, -1, v0
+; GFX9-FLATSCR-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:240
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:240
 ; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v7, 1
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v0, s[38:39] offset:224
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:208
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v0, s[38:39] offset:192
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:176
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v0, s[38:39] offset:160
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:144
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2080
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:128
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:112
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:96
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2050
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:80
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2040
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:208
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:64
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2030
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[11:14], v5, s[38:39] offset:160
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:144
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:48
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v5, s[38:39]
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-FLATSCR-NEXT:    v_lshl_add_u32 v4, v7, 13, v4
-; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX9-FLATSCR-NEXT:    scratch_store_dword v4, v6, off
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:64
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:48
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2080
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:32
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2070
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39] offset:16
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:16
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v0, s[38:39]
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v4, 16
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    v_lshl_add_u32 v4, v0, 13, v4
+; GFX9-FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-FLATSCR-NEXT:    scratch_store_dword v4, v7, off
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ; def s[0:7]
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
@@ -10744,27 +10750,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v11
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v12
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, v13
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2100
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    s_nop 0
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v14, v3
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v13, v2
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v12, v1
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
@@ -10779,58 +10781,62 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:  .LBB1_2: ; %ret
 ; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:112
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    v_lshlrev_b64 v[4:5], 8, v[5:6]
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s37
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, s36, v4
+; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:96
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:240
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[8:11], off offset:224
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:80
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:208
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:192
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2080
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:64
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:48
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:176
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:160
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:32
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2050
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:16
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[7:10], s[36:37]
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:144
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:128
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2040
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[6:9], s[36:37] offset:240
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:112
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:96
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2030
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:80
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[11:14], s[36:37] offset:160
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2020
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:144
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:64
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2070
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37] offset:128
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:48
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
 ; GFX10-FLATSCR-LABEL: test_limited_sgpr:
@@ -10841,31 +10847,32 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-FLATSCR-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x24
 ; GFX10-FLATSCR-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX10-FLATSCR-NEXT:    s_mov_b32 s33, exec_lo
-; GFX10-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
-; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
+; GFX10-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v5, -1, v0
+; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; GFX10-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLATSCR-NEXT:    s_clause 0xf
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[35:38], v5, s[38:39] offset:240
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[31:34], v5, s[38:39] offset:224
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[27:30], v5, s[38:39] offset:208
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v5, s[38:39] offset:192
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[19:22], v5, s[38:39] offset:176
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[15:18], v5, s[38:39] offset:160
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[11:14], v5, s[38:39] offset:144
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v5, s[38:39] offset:128
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[63:66], v5, s[38:39] offset:112
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[59:62], v5, s[38:39] offset:96
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[55:58], v5, s[38:39] offset:80
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[51:54], v5, s[38:39] offset:64
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[47:50], v5, s[38:39] offset:48
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[43:46], v5, s[38:39] offset:32
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[39:42], v5, s[38:39] offset:16
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[38:39]
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[64:67], v0, s[38:39] offset:240
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[60:63], v0, s[38:39] offset:224
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[56:59], v0, s[38:39] offset:208
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[52:55], v0, s[38:39] offset:192
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[48:51], v0, s[38:39] offset:176
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[44:47], v0, s[38:39] offset:160
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[40:43], v0, s[38:39] offset:144
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[36:39], v0, s[38:39] offset:128
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[32:35], v0, s[38:39] offset:112
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[28:31], v0, s[38:39] offset:96
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[24:27], v0, s[38:39] offset:80
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v0, s[38:39] offset:64
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v0, s[38:39] offset:48
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v0, s[38:39] offset:32
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v0, s[38:39] offset:16
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v0, s[38:39]
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    v_lshl_add_u32 v4, v0, 13, 16
-; GFX10-FLATSCR-NEXT:    scratch_store_dword v4, v6, off
+; GFX10-FLATSCR-NEXT:    scratch_store_dword v4, v7, off
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ; def s[0:7]
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
@@ -10894,124 +10901,124 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v58
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v62
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v57
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v86, v56
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v85, v55
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v91, v61
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v90, v60
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v89, v59
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v34
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v68, v38
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v33
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v32
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v57, v31
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v67, v37
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v66, v36
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v65, v35
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v10
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v72, v42
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v76, v46
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v80, v50
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v84, v54
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v33, v7
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v71, v41
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v70, v40
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v69, v39
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v40, v14
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v75, v45
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v74, v44
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v73, v43
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v44, v18
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v79, v49
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v78, v48
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v77, v47
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v48, v22
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v83, v53
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v82, v52
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v81, v51
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v52, v26
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v56, v30
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v34, v8
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v11
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v41, v15
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v45, v19
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v49, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v53, v27
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v39, v13
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v12
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v43, v17
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v42, v16
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v47, v21
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v46, v20
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v51, v25
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v50, v24
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v29
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v28
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v59
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v63
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v58
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v86, v57
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v85, v56
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v91, v62
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v90, v61
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v89, v60
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v35
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v68, v39
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v34
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v33
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v57, v32
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v67, v38
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v66, v37
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v65, v36
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v11
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v72, v43
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v76, v47
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v80, v51
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v84, v55
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v33, v8
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v71, v42
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v70, v41
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v69, v40
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v40, v15
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v75, v46
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v74, v45
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v73, v44
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v44, v19
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v79, v50
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v78, v49
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v77, v48
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v48, v23
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v83, v54
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v82, v53
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v81, v52
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v52, v27
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v56, v31
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v10
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v34, v9
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v12
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v41, v16
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v45, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v49, v24
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v53, v28
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v39, v14
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v13
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v43, v18
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v42, v17
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v47, v22
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v46, v21
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v51, v26
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v50, v25
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v30
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v29
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v7, v33
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, v53
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, v49
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, v45
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v15, v41
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, v37
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v8, v34
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v9, v35
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v36
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v31, v57
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v54
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v55
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v30, v56
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, v50
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v51
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v52
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, v46
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, v47
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v48
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, v42
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v17, v43
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, v44
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, v38
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v13, v39
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, v40
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v32, v58
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v33, v59
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v34, v60
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v8, v33
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v53
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, v49
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, v45
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, v41
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, v37
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v9, v34
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v35
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, v36
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v32, v57
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v54
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v30, v55
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v31, v56
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v50
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v51
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, v52
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, v46
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v47
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, v48
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v17, v42
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, v43
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, v44
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v13, v38
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, v39
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v15, v40
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v33, v58
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v34, v59
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v60
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v65
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v66
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v67
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v68
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v89
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v85
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v51, v81
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v47, v77
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v43, v73
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v39, v69
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v90
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v61, v91
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v62, v92
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v56, v86
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v57, v87
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v88
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v52, v82
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v53, v83
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v84
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v48, v78
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v49, v79
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v50, v80
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v44, v74
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v45, v75
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v46, v76
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v40, v70
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v41, v71
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v42, v72
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v65
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v66
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v67
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v39, v68
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[64:67], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v89
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v56, v85
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v52, v81
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v48, v77
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v44, v73
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v40, v69
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v61, v90
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v62, v91
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v63, v92
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v57, v86
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v87
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v88
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v53, v82
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v83
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v84
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v49, v78
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v50, v79
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v51, v80
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v45, v74
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v46, v75
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v47, v76
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v41, v70
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v42, v71
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v43, v72
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
@@ -11024,23 +11031,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:  .LBB1_2: ; %ret
 ; GFX10-FLATSCR-NEXT:    s_or_b32 exec_lo, exec_lo, s33
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[63:66], s[36:37] offset:112
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[59:62], s[36:37] offset:96
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[55:58], s[36:37] offset:80
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[51:54], s[36:37] offset:64
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[47:50], s[36:37] offset:48
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[43:46], s[36:37] offset:32
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[39:42], s[36:37] offset:16
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[36:37]
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[35:38], s[36:37] offset:240
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[31:34], s[36:37] offset:224
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[27:30], s[36:37] offset:208
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[23:26], s[36:37] offset:192
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[19:22], s[36:37] offset:176
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[15:18], s[36:37] offset:160
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[11:14], s[36:37] offset:144
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[7:10], s[36:37] offset:128
+; GFX10-FLATSCR-NEXT:    v_lshlrev_b64 v[4:5], 8, v[5:6]
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v4, vcc_lo, s36, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s37, v5, vcc_lo
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[64:67], off offset:240
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[60:63], off offset:224
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[56:59], off offset:208
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[52:55], off offset:192
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[48:51], off offset:176
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[44:47], off offset:160
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[40:43], off offset:144
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[36:39], off offset:128
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[32:35], off offset:112
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[28:31], off offset:96
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[24:27], off offset:80
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:64
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:48
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:32
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[8:11], off offset:16
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-FLATSCR-NEXT:    s_endpgm
 entry:
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index cc73302f8563733..930ba80ad69638d 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -571,7 +571,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
-  ; SI-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
+  ; SI-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
   ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3

diff  --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index dfbc4790e63b0eb..b8d18f56b760239 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,18 +6,22 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v5, s[4:5]
+; GFX906-NEXT:    global_load_dword v2, v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v5, s[6:7]
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
@@ -50,11 +54,11 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v6, s[4:5]
+; GFX906-NEXT:    global_load_dword v2, v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -62,7 +66,11 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v6, s[6:7]
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -98,32 +106,36 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[2:3], v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_lshrrev_b64 v[4:5], 24, v[2:3]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v7, s[6:7]
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT:    v_lshrrev_b64 v[4:5], 24, v[2:3]
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX906-NEXT:  .LBB2_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_byte v5, v2, s[2:3] offset:4
-; GFX906-NEXT:    global_store_dword v5, v0, s[2:3]
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_byte v1, v3, s[2:3] offset:4
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -147,42 +159,46 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[2:3], v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v10, s[6:7]
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX906-NEXT:  .LBB3_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v3, v[0:1], s[2:3]
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v7
+; GFX906-NEXT:    v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx2 v1, v[2:3], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -206,64 +222,68 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v18, 4, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v18, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 4, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
 ; GFX906-NEXT:  .LBB4_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v8
-; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v6
-; GFX906-NEXT:    v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
+; GFX906-NEXT:    v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v14
+; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v12
+; GFX906-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v11
+; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v9
+; GFX906-NEXT:    v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v8
+; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v6
+; GFX906-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -286,114 +306,118 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-LABEL: v32i8_liveout:
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT:    v_lshlrev_b32_e32 v31, 5, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
 ; GFX906-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[4:5]
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v1, s[4:5] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[6:9], v1, s[4:5]
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 8, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 8, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v31, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 5, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v10, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v11, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[10:11], off offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[6:9], v[10:11], off
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v15, 8, v4
+; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v21, 8, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v22, 24, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v23, 16, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v24, 8, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v25, 24, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v27, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v28, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v32, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
 ; GFX906-NEXT:  .LBB5_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT:    v_lshlrev_b16_e32 v31, 8, v33
+; GFX906-NEXT:    v_lshlrev_b16_e32 v28, 8, v28
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v32
+; GFX906-NEXT:    v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v31
+; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v27
+; GFX906-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v8, 8, v25
+; GFX906-NEXT:    v_or_b32_sdwa v8, v26, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v24
+; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v22
+; GFX906-NEXT:    v_or_b32_sdwa v9, v23, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v21
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v19
+; GFX906-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v18
+; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v16
+; GFX906-NEXT:    v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v15
+; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v13
+; GFX906-NEXT:    v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v12
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
-; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT:    v_lshlrev_b16_e32 v24, 8, v24
-; GFX906-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT:    v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v20
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v18
-; GFX906-NEXT:    v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v17
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v15
-; GFX906-NEXT:    v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v14
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v12
-; GFX906-NEXT:    v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v11
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v10
+; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1]
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1] offset:16
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -422,1548 +446,1540 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    s_add_u32 s8, s8, s3
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v63, 3, v0
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX906-NEXT:    s_addc_u32 s9, s9, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
+; GFX906-NEXT:    global_load_dwordx4 v[18:21], v2, s[4:5] offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[6:9], v2, s[4:5] offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[10:13], v2, s[4:5] offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[14:17], v2, s[4:5] offset:192
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v9
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v9
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v6
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v13
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v13
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v13
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v12
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v11
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v11
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v10
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v10
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v17
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v17
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v16
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v16
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v15
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v15
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v14
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v14
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[18:21], v2, s[4:5] offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[22:25], v2, s[4:5] offset:160
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v21
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v20
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v19
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v18
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v25
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v25
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v24
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v24
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v24
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v23
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v23
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v22
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v22
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v22
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[26:29], v2, s[4:5] offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[30:33], v2, s[4:5] offset:128
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v29
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v29
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v29
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v28
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v28
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v28
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v27
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v27
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v27
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v26
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v26
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v26
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v33
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v33
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v32
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v32
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v32
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v31
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v31
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v30
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v30
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v30
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[34:37], v2, s[4:5] offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[38:41], v2, s[4:5] offset:96
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v37
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v37
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v37
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v36
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v36
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v36
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v35
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v35
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v34
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v34
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v34
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v41
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v41
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v41
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v40
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v40
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v39
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v39
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v38
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v38
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v38
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[42:45], v2, s[4:5] offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[46:49], v2, s[4:5] offset:64
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v45
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v45
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v45
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v44
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v44
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v44
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v43
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v43
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v43
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v42
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v42
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v42
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v49
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v49
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v49
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v48
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v48
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v48
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v47
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v47
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v47
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v46
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v46
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v46
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[50:53], v2, s[4:5] offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[54:57], v2, s[4:5] offset:32
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v53
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v53
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v53
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v52
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v52
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v52
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v51
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v51
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v51
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v50
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v50
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v57
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v57
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v57
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v56
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v56
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v56
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v55
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v55
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v55
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 24, v54
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 16, v54
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v3, 8, v54
+; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[58:61], v2, s[4:5] offset:16
+; GFX906-NEXT:    s_nop 0
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v2, s[4:5]
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v61
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v61
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v61
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v60
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v60
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v60
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v59
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v59
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v58
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v58
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    s_waitcnt vmcnt(12)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v5
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v5
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v4
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v4
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v4
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v3
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v3
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 16, v2
+; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX906-NEXT:    buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
+; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT:    s_cbranch_execz .LBB6_2
+; GFX906-NEXT:  ; %bb.1: ; %bb.1
+; GFX906-NEXT:    v_lshlrev_b64 v[2:3], 3, v[0:1]
+; GFX906-NEXT:    v_mov_b32_e32 v0, s7
+; GFX906-NEXT:    v_add_co_u32_e32 v2, vcc, s6, v2
+; GFX906-NEXT:    v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; GFX906-NEXT:    global_load_dwordx4 v[18:21], v[2:3], off offset:240
+; GFX906-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:224
+; GFX906-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:208
+; GFX906-NEXT:    global_load_dwordx4 v[14:17], v[2:3], off offset:192
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v18, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v21, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
+; GFX906-NEXT:    global_load_dwordx4 v[18:21], v[2:3], off offset:176
+; GFX906-NEXT:    global_load_dwordx4 v[22:25], v[2:3], off offset:160
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
+; GFX906-NEXT:    global_load_dwordx4 v[26:29], v[2:3], off offset:144
+; GFX906-NEXT:    global_load_dwordx4 v[30:33], v[2:3], off offset:128
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[4:5]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB6_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT:    buffer_store_dword v17, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
-; GFX906-NEXT:    global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
-; GFX906-NEXT:    global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v25
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v26
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v32
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v32
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v31
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v31
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v30
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v30
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
-; GFX906-NEXT:    global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
+; GFX906-NEXT:    global_load_dwordx4 v[34:37], v[2:3], off offset:112
+; GFX906-NEXT:    global_load_dwordx4 v[38:41], v[2:3], off offset:96
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v36
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v36
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v36
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v35
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v35
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v35
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v34
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v34
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v40
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v40
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v40
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v39
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v39
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v39
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v38
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v38
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v38
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
-; GFX906-NEXT:    global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
+; GFX906-NEXT:    global_load_dwordx4 v[42:45], v[2:3], off offset:80
+; GFX906-NEXT:    global_load_dwordx4 v[46:49], v[2:3], off offset:64
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v44
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v44
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v43
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v43
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v43
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v42
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v42
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v42
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v48
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v48
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v47
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v47
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v47
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v46
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v46
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v46
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
-; GFX906-NEXT:    global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
+; GFX906-NEXT:    global_load_dwordx4 v[50:53], v[2:3], off offset:48
+; GFX906-NEXT:    global_load_dwordx4 v[54:57], v[2:3], off offset:32
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v52
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v52
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v52
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v51
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v51
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v51
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v50
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v50
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v50
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v57
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v57
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v57
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v56
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v56
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v56
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v55
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v55
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v55
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v54
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v54
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v54
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT:    global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
+; GFX906-NEXT:    global_load_dwordx4 v[58:61], v[2:3], off offset:16
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    global_load_dwordx4 v[0:3], v63, s[6:7]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v61
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v61
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v61
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v60
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v60
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v60
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v59
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v59
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v59
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v58
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v58
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v58
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
 ; GFX906-NEXT:    s_waitcnt vmcnt(12)
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT:    buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT:    buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
-; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 8, v4
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX906-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
+; GFX906-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill
 ; GFX906-NEXT:  .LBB6_2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v63
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v62
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v63, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v62, 8, v62
 ; GFX906-NEXT:    v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v62, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3]
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v59, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v61, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v58, 8, v58
 ; GFX906-NEXT:    v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v57, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v58, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX906-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:16
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v55, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v56, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v54, 8, v54
 ; GFX906-NEXT:    v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v53, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v54, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
+; GFX906-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:32
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v51, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v50, 8, v50
 ; GFX906-NEXT:    v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v49, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v50, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
+; GFX906-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:48
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v47, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v46, 8, v46
 ; GFX906-NEXT:    v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v45, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v46, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
+; GFX906-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:64
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v43, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v42, 8, v42
 ; GFX906-NEXT:    v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v41, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v42, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
+; GFX906-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:80
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v39, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v38, 8, v38
 ; GFX906-NEXT:    v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v37, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v38, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
+; GFX906-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:96
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v37, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
 ; GFX906-NEXT:    v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
+; GFX906-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:112
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v31, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v32, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v30, 8, v30
 ; GFX906-NEXT:    v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v29, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v30, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:128
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v27, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v29, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v26, 8, v26
 ; GFX906-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v25, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v26, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:144
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v23, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v25, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v22, 8, v22
 ; GFX906-NEXT:    v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v21, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v22, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:160
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v19, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v4, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v5, v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v18, 8, v18
 ; GFX906-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v17, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v18, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
-; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:176
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v15, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v13, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v5, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
-; GFX906-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v14, 8, v14
+; GFX906-NEXT:    v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v14, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
+; GFX906-NEXT:    v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:192
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v10, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v9, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
-; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT:    v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:208
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(4)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    s_waitcnt vmcnt(3)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
 ; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:224
 ; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v7, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(7)
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(5)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(3)
-; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT:    s_waitcnt vmcnt(2)
+; GFX906-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT:    v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GFX906-NEXT:    v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT:    buffer_load_dword v6, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    buffer_load_dword v5, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT:    v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT:    global_store_dwordx4 v1, v[2:5], s[2:3] offset:240
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
index fe70e9e426ddfd2..75a21bdc0ff3b86 100644
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -113,7 +113,8 @@ define amdgpu_kernel void @xnor_v_s_i32_one_use(ptr addrspace(1) %out, i32 %s) {
 ; GCN-LABEL: {{^}}xnor_i64_s_v_one_use
 ; GCN-NOT: s_xnor_b64
 ; GCN: s_not_b64
-; GCN: v_xor_b32_e32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_s_v_one_use(
@@ -131,7 +132,8 @@ entry:
 ; GCN-LABEL: {{^}}xnor_i64_v_s_one_use
 ; GCN-NOT: s_xnor_b64
 ; GCN: s_not_b64
-; GCN: v_xor_b32_e32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_v_s_one_use(

diff  --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
index 26d8905b48d045a..f2a53653ec2ffc1 100644
--- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -8,8 +8,9 @@ define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(ptr, i32 %c_nbloc
 ; CHECK-NEXT:    movl %edx, %edx
 ; CHECK-NEXT:    movl (%rdi,%rdx,4), %edx
 ; CHECK-NEXT:    movzbl %dl, %r10d
-; CHECK-NEXT:    addl $4, %r10d
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    shrl $8, %edx
+; CHECK-NEXT:    addl $4, %r10d
 ; CHECK-NEXT:    movl (%rdi,%rdx,4), %edx
 ; CHECK-NEXT:    movzbl %dl, %edi
 ; CHECK-NEXT:    shrl $8, %edx

diff  --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
index 8ff4f4067dabda2..ad85a090010f8ea 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll
@@ -1392,8 +1392,11 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_xor_64_const_br(ptr %v) nounwind {
 ; CHECK-LABEL: atomic_shl1_xor_64_const_br:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    lock btcq $4, (%rdi)
-; CHECK-NEXT:    jae .LBB48_1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shlq $4, %rax
+; CHECK-NEXT:    je .LBB48_1
 ; CHECK-NEXT:  # %bb.2: # %if.then
 ; CHECK-NEXT:    movq 32(%rdi), %rax
 ; CHECK-NEXT:    retq
@@ -1455,9 +1458,12 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_xor_64_const_brz(ptr %v) nounwind {
 ; CHECK-LABEL: atomic_shl1_xor_64_const_brz:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    lock btcq $4, (%rdi)
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shlq $4, %rax
 ; CHECK-NEXT:    movl $123, %eax
-; CHECK-NEXT:    jae .LBB50_1
+; CHECK-NEXT:    je .LBB50_1
 ; CHECK-NEXT:  # %bb.2: # %return
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB50_1: # %if.then
@@ -1518,8 +1524,11 @@ return:                                           ; preds = %entry, %if.then
 define i64 @atomic_shl1_xor_64_const_brnz(ptr %v) nounwind {
 ; CHECK-LABEL: atomic_shl1_xor_64_const_brnz:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    lock btcq $4, (%rdi)
-; CHECK-NEXT:    jae .LBB52_1
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shlq $4, %rax
+; CHECK-NEXT:    je .LBB52_1
 ; CHECK-NEXT:  # %bb.2: # %if.then
 ; CHECK-NEXT:    movq 32(%rdi), %rax
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
index f0c8a7e2083261f..7a0527be0541941 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
@@ -73,7 +73,7 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) {
 ; CHECK-NEXT:  # %bb.4: # %.preheader
 ; CHECK-NEXT:    shlq $6, %rcx
 ; CHECK-NEXT:    addq %rcx, %rsi
-; CHECK-NEXT:    shll $6, %eax
+; CHECK-NEXT:    shlq $6, %rax
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll
index 75e29df9f34acf8..d8e73a5cf37d8d9 100644
--- a/llvm/test/CodeGen/X86/avxvnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll
@@ -78,7 +78,7 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
 ; AVX-NEXT:  # %bb.4: # %.preheader
 ; AVX-NEXT:    shlq $4, %rcx
 ; AVX-NEXT:    addq %rcx, %rsi
-; AVX-NEXT:    shll $4, %eax
+; AVX-NEXT:    shlq $4, %rax
 ; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1
@@ -125,7 +125,7 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
 ; AVX512-NEXT:  # %bb.4: # %.preheader
 ; AVX512-NEXT:    shlq $4, %rcx
 ; AVX512-NEXT:    addq %rcx, %rsi
-; AVX512-NEXT:    shll $4, %eax
+; AVX512-NEXT:    shlq $4, %rax
 ; AVX512-NEXT:    xorl %ecx, %ecx
 ; AVX512-NEXT:    .p2align 4, 0x90
 ; AVX512-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1
@@ -425,7 +425,7 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
 ; AVX-NEXT:  # %bb.4: # %.preheader
 ; AVX-NEXT:    shlq $5, %rcx
 ; AVX-NEXT:    addq %rcx, %rsi
-; AVX-NEXT:    shll $5, %eax
+; AVX-NEXT:    shlq $5, %rax
 ; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  .LBB4_5: # =>This Inner Loop Header: Depth=1
@@ -472,7 +472,7 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
 ; AVX512-NEXT:  # %bb.4: # %.preheader
 ; AVX512-NEXT:    shlq $5, %rcx
 ; AVX512-NEXT:    addq %rcx, %rsi
-; AVX512-NEXT:    shll $5, %eax
+; AVX512-NEXT:    shlq $5, %rax
 ; AVX512-NEXT:    xorl %ecx, %ecx
 ; AVX512-NEXT:    .p2align 4, 0x90
 ; AVX512-NEXT:  .LBB4_5: # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index 81eac5676bb5c3c..17fd612b812ebc1 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -168,8 +168,8 @@ define i64 @not_bswap() {
 ; CHECK64-NEXT:    movzwl var16(%rip), %eax
 ; CHECK64-NEXT:    movl %eax, %ecx
 ; CHECK64-NEXT:    shrl $8, %ecx
-; CHECK64-NEXT:    shll $8, %eax
-; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    shlq $8, %rax
+; CHECK64-NEXT:    orq %rcx, %rax
 ; CHECK64-NEXT:    retq
   %init = load i16, ptr @var16
   %big = zext i16 %init to i64
@@ -197,7 +197,7 @@ define i64 @not_useful_bswap() {
 ; CHECK64-LABEL: not_useful_bswap:
 ; CHECK64:       # %bb.0:
 ; CHECK64-NEXT:    movzbl var8(%rip), %eax
-; CHECK64-NEXT:    shll $8, %eax
+; CHECK64-NEXT:    shlq $8, %rax
 ; CHECK64-NEXT:    retq
   %init = load i8, ptr @var8
   %big = zext i8 %init to i64
@@ -224,9 +224,12 @@ define i64 @finally_useful_bswap() {
 ;
 ; CHECK64-LABEL: finally_useful_bswap:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movzwl var16(%rip), %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    shrl $16, %eax
+; CHECK64-NEXT:    movzwl var16(%rip), %ecx
+; CHECK64-NEXT:    movzbl %cl, %eax
+; CHECK64-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
+; CHECK64-NEXT:    shrl $8, %ecx
+; CHECK64-NEXT:    shlq $8, %rax
+; CHECK64-NEXT:    orq %rcx, %rax
 ; CHECK64-NEXT:    retq
   %init = load i16, ptr @var16
   %big = zext i16 %init to i64

diff  --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index a3568716edd9e9d..5500ad33230431c 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -832,7 +832,7 @@ define void @pr59781(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl (%rdi), %eax
 ; CHECK-NEXT:    movzbl 2(%rdi), %ecx
-; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    shlq $16, %rcx
 ; CHECK-NEXT:    orq %rax, %rcx
 ; CHECK-NEXT:    movq %rcx, (%rsi)
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll
index 5e030de1409f222..33834bc4470aeab 100644
--- a/llvm/test/CodeGen/X86/cmp-concat.ll
+++ b/llvm/test/CodeGen/X86/cmp-concat.ll
@@ -35,8 +35,8 @@ define i1 @cmp_anybits_concat_shl_shl_i16(i16 %x, i16 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl %di, %eax
 ; CHECK-NEXT:    movzwl %si, %ecx
-; CHECK-NEXT:    shll $8, %ecx
-; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shlq $8, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %zx = zext i16 %x to i64
@@ -53,8 +53,8 @@ define i1 @cmp_anybits_concat_shl_shl_i16_commute(i16 %x, i16 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl %di, %eax
 ; CHECK-NEXT:    movzwl %si, %ecx
-; CHECK-NEXT:    shll $8, %ecx
-; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shlq $8, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %zx = zext i16 %x to i64

diff  --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
index a5295d44b07c8e5..63061b0d851b6b7 100644
--- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
+++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
@@ -67,7 +67,7 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 {
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    callq *%rax
-; CHECK-NEXT:    shll $4, %r14d
+; CHECK-NEXT:    shlq $4, %r14
 ; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; CHECK-NEXT:    movl %r13d, 0
 ; CHECK-NEXT:    movb $0, 4

diff  --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index 9f81fab54a49d01..32579bd05605e7f 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -368,20 +368,20 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
 ; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $235867919, %ecx # imm = 0xE0F0F0F
-; X64-NEXT:    shll $4, %ecx
+; X64-NEXT:    shlq $4, %rcx
 ; X64-NEXT:    shrl $4, %eax
 ; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $590558003, %ecx # imm = 0x23333333
 ; X64-NEXT:    shrl $2, %eax
 ; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X64-NEXT:    leal (%rax,%rcx,4), %eax
+; X64-NEXT:    leaq (%rax,%rcx,4), %rax
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $357913941, %ecx # imm = 0x15555555
 ; X64-NEXT:    shrl %eax
 ; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    leaq (%rax,%rcx,2), %rax
 ; X64-NEXT:    retq
     %1 = call i64 @llvm.bitreverse.i64(i64 %a)
     %2 = shl i64 %1, 33

diff  --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
index 142ac754c3f7e8e..5c23c155ed85ff3 100644
--- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
@@ -1933,7 +1933,8 @@ define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) {
 ;
 ; X64-LABEL: test_i64_2147483647_mask_shl_1:
 ; X64:       # %bb.0:
-; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
+; X64-NEXT:    leaq (%rdi,%rdi), %rax
 ; X64-NEXT:    retq
   %t0 = and i64 %a0, 2147483647
   %t1 = shl i64 %t0, 1

diff  --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index 42b325dd4c229e0..6bfda6827d520a1 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -146,7 +146,7 @@ define i64 @fun7(i8 zeroext %v) {
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    sarb $4, %dil
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shlq $4, %rax
 ; X64-NEXT:    retq
 entry:
   %shr = ashr i8 %v, 4
@@ -166,7 +166,9 @@ define i64 @fun8(i16 zeroext %v) {
 ; X64-LABEL: fun8:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    andl $1048560, %eax # imm = 0xFFFF0
+; X64-NEXT:    shrl $4, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    shlq $4, %rax
 ; X64-NEXT:    retq
 entry:
   %shr = ashr i16 %v, 4
@@ -215,12 +217,11 @@ define i64 @fun10(i8 zeroext %v) {
 ;
 ; X64-LABEL: fun10:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    andl $-16, %edi
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    shrb $4, %dil
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    retq
 entry:
   %shr = lshr i8 %v, 4
@@ -244,9 +245,9 @@ define i64 @fun11(i16 zeroext %v) {
 ; X64-LABEL: fun11:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    andl $-16, %edi
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $4, %rax
 ; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
@@ -272,9 +273,9 @@ define i64 @fun12(i32 zeroext %v) {
 ; X64-LABEL: fun12:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $4, %eax
-; X64-NEXT:    andl $-16, %edi
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $4, %rax
 ; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    retq
 entry:

diff  --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9d9..2e13776715e5bec 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,35 +425,36 @@ entry:
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    movq %rsi, %rcx
-; X86-64-NEXT:    shldq $62, %rdi, %rcx
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    shldq $62, %rdi, %rax
 ; X86-64-NEXT:    shrq $2, %rsi
-; X86-64-NEXT:    addq %rsi, %rcx
-; X86-64-NEXT:    adcq $0, %rcx
-; X86-64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
-; X86-64-NEXT:    movq %rcx, %rax
-; X86-64-NEXT:    mulq %rdx
+; X86-64-NEXT:    addq %rax, %rsi
+; X86-64-NEXT:    adcq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    mulq %rcx
 ; X86-64-NEXT:    shrq %rdx
-; X86-64-NEXT:    leal (%rdx,%rdx,2), %eax
-; X86-64-NEXT:    subl %eax, %ecx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rsi
 ; X86-64-NEXT:    andl $3, %edi
-; X86-64-NEXT:    leaq (%rdi,%rcx,4), %rax
+; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
 ; X86-64-NEXT:    xorl %edx, %edx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
 ; WIN64-NEXT:    movq %rdx, %r8
-; WIN64-NEXT:    shldq $62, %rcx, %r8
-; WIN64-NEXT:    shrq $2, %rdx
-; WIN64-NEXT:    addq %rdx, %r8
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    shldq $62, %rcx, %rax
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    addq %rax, %r8
 ; WIN64-NEXT:    adcq $0, %r8
 ; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
 ; WIN64-NEXT:    movq %r8, %rax
 ; WIN64-NEXT:    mulq %rdx
 ; WIN64-NEXT:    shrq %rdx
-; WIN64-NEXT:    leal (%rdx,%rdx,2), %eax
-; WIN64-NEXT:    subl %eax, %r8d
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r8
 ; WIN64-NEXT:    andl $3, %ecx
 ; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
 ; WIN64-NEXT:    xorl %edx, %edx

diff  --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a2b..38a1de251a3d919 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -8091,9 +8091,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
 ; X64-NOBMI-LABEL: pr38938:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl (%rsi), %eax
-; X64-NOBMI-NEXT:    shrl $19, %eax
-; X64-NOBMI-NEXT:    andl $4092, %eax # imm = 0xFFC
-; X64-NOBMI-NEXT:    incl (%rdi,%rax)
+; X64-NOBMI-NEXT:    shrl $21, %eax
+; X64-NOBMI-NEXT:    andl $1023, %eax # imm = 0x3FF
+; X64-NOBMI-NEXT:    incl (%rdi,%rax,4)
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMINOTBM-LABEL: pr38938:

diff  --git a/llvm/test/CodeGen/X86/fold-and-shift.ll b/llvm/test/CodeGen/X86/fold-and-shift.ll
index 985d7c6c82f06d2..41adab63a11a6f4 100644
--- a/llvm/test/CodeGen/X86/fold-and-shift.ll
+++ b/llvm/test/CodeGen/X86/fold-and-shift.ll
@@ -36,7 +36,8 @@ define i32 @t2(ptr %X, i32 %i) {
 ; X64-LABEL: t2:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movzwl %si, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    movl (%rdi,%rax,2), %eax
 ; X64-NEXT:    retq
 entry:
   %tmp2 = shl i32 %i, 1

diff  --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
index f48790de86fe9b5..b9e68036866210f 100644
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -137,7 +137,7 @@ define fp128 @TestI128_1(fp128 %x) #0 {
 ; SSE-NEXT:    xorl %ecx, %ecx
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    sets %cl
-; SSE-NEXT:    shll $4, %ecx
+; SSE-NEXT:    shlq $4, %rcx
 ; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    retq
@@ -151,7 +151,7 @@ define fp128 @TestI128_1(fp128 %x) #0 {
 ; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    testl %eax, %eax
 ; AVX-NEXT:    sets %cl
-; AVX-NEXT:    shll $4, %ecx
+; AVX-NEXT:    shlq $4, %rcx
 ; AVX-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0
 ; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/lea-dagdag.ll b/llvm/test/CodeGen/X86/lea-dagdag.ll
index f81851a92d8de0b..2705bd00f5d2cc3 100644
--- a/llvm/test/CodeGen/X86/lea-dagdag.ll
+++ b/llvm/test/CodeGen/X86/lea-dagdag.ll
@@ -199,7 +199,7 @@ define i64 @and_i32_zext_shl_add_i64_overshift(i64 %t0, i32 %t1) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
 ; CHECK-NEXT:    andl $8, %esi
-; CHECK-NEXT:    shll $4, %esi
+; CHECK-NEXT:    shlq $4, %rsi
 ; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
 ; CHECK-NEXT:    retq
   %t4 = and i32 %t1, 8

diff  --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll
index f7588577a3e9a18..cec19dcf49c8dfa 100644
--- a/llvm/test/CodeGen/X86/lea-opt2.ll
+++ b/llvm/test/CodeGen/X86/lea-opt2.ll
@@ -192,7 +192,7 @@ define void @test9(i64 %p, i64 %s) {
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testl $4095, %eax # imm = 0xFFF
 ; CHECK-NEXT:    setne %cl
-; CHECK-NEXT:    shll $12, %ecx
+; CHECK-NEXT:    shlq $12, %rcx
 ; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    andq $-4096, %rcx # imm = 0xF000
 ; CHECK-NEXT:    addq %rcx, %rdi

diff  --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
index a2e4e4784d36180..9f0c1ea1dc3f6f7 100644
--- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -28,10 +28,10 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    movzbl %r8b, %r14d
 ; GENERIC-NEXT:    ## kill: def $r8d killed $r8d def $r8
 ; GENERIC-NEXT:    shrl $24, %r8d
-; GENERIC-NEXT:    movl %ebx, %r15d
-; GENERIC-NEXT:    shrl $14, %r15d
-; GENERIC-NEXT:    andl $1020, %r15d ## imm = 0x3FC
-; GENERIC-NEXT:    movl (%rax,%r15), %ebp
+; GENERIC-NEXT:    movl %ebx, %ebp
+; GENERIC-NEXT:    shrl $16, %ebp
+; GENERIC-NEXT:    movzbl %bpl, %r15d
+; GENERIC-NEXT:    movl (%rax,%r15,4), %ebp
 ; GENERIC-NEXT:    xorl (%rdi,%r8,4), %ebp
 ; GENERIC-NEXT:    xorl -12(%r9), %ebp
 ; GENERIC-NEXT:    shrl $24, %ebx
@@ -46,9 +46,9 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:  ## %bb.2: ## %bb1
 ; GENERIC-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; GENERIC-NEXT:    movl %r14d, %ebx
-; GENERIC-NEXT:    shrl $14, %ebx
-; GENERIC-NEXT:    andl $1020, %ebx ## imm = 0x3FC
-; GENERIC-NEXT:    xorl (%rax,%rbx), %r8d
+; GENERIC-NEXT:    shrl $16, %ebx
+; GENERIC-NEXT:    movzbl %bl, %ebx
+; GENERIC-NEXT:    xorl (%rax,%rbx,4), %r8d
 ; GENERIC-NEXT:    xorl -4(%r9), %r8d
 ; GENERIC-NEXT:    shrl $24, %r14d
 ; GENERIC-NEXT:    movzbl %bpl, %ebx
@@ -61,9 +61,9 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    shlq $4, %rcx
 ; GENERIC-NEXT:    andl $-16777216, %r8d ## imm = 0xFF000000
 ; GENERIC-NEXT:    movl %r14d, %r9d
-; GENERIC-NEXT:    shrl $14, %r9d
-; GENERIC-NEXT:    andl $1020, %r9d ## imm = 0x3FC
-; GENERIC-NEXT:    movzbl 2(%rax,%r9), %r9d
+; GENERIC-NEXT:    shrl $16, %r9d
+; GENERIC-NEXT:    movzbl %r9b, %r9d
+; GENERIC-NEXT:    movzbl 2(%rax,%r9,4), %r9d
 ; GENERIC-NEXT:    shll $16, %r9d
 ; GENERIC-NEXT:    orl %r8d, %r9d
 ; GENERIC-NEXT:    xorl 16(%rcx,%rdx), %r9d
@@ -93,6 +93,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ;
 ; ATOM-LABEL: t:
 ; ATOM:       ## %bb.0: ## %entry
+; ATOM-NEXT:    pushq %rbp
 ; ATOM-NEXT:    pushq %r15
 ; ATOM-NEXT:    pushq %r14
 ; ATOM-NEXT:    pushq %rbx
@@ -112,10 +113,10 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movl %r8d, %r14d
 ; ATOM-NEXT:    movzbl %r8b, %r8d
 ; ATOM-NEXT:    shrl $24, %r15d
-; ATOM-NEXT:    shrl $14, %ebx
+; ATOM-NEXT:    shrl $16, %ebx
 ; ATOM-NEXT:    shrl $24, %r14d
-; ATOM-NEXT:    andl $1020, %ebx ## imm = 0x3FC
-; ATOM-NEXT:    movl (%rax,%rbx), %ebx
+; ATOM-NEXT:    movzbl %bl, %ebx
+; ATOM-NEXT:    movl (%rax,%rbx,4), %ebx
 ; ATOM-NEXT:    xorl (%rdi,%r14,4), %ebx
 ; ATOM-NEXT:    movl (%r10,%r8,4), %r14d
 ; ATOM-NEXT:    xorl -12(%r9), %ebx
@@ -128,12 +129,12 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    jb LBB0_3
 ; ATOM-NEXT:  ## %bb.2: ## %bb1
 ; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; ATOM-NEXT:    movl %r14d, %r15d
+; ATOM-NEXT:    movl %r14d, %ebp
 ; ATOM-NEXT:    movzbl %bl, %ebx
 ; ATOM-NEXT:    shrl $24, %r14d
-; ATOM-NEXT:    shrl $14, %r15d
-; ATOM-NEXT:    andl $1020, %r15d ## imm = 0x3FC
-; ATOM-NEXT:    xorl (%rax,%r15), %r8d
+; ATOM-NEXT:    shrl $16, %ebp
+; ATOM-NEXT:    movzbl %bpl, %r15d
+; ATOM-NEXT:    xorl (%rax,%r15,4), %r8d
 ; ATOM-NEXT:    movl (%r10,%rbx,4), %r15d
 ; ATOM-NEXT:    xorl (%rdi,%r14,4), %r15d
 ; ATOM-NEXT:    xorl -4(%r9), %r8d
@@ -145,11 +146,11 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    andl $-16777216, %r8d ## imm = 0xFF000000
 ; ATOM-NEXT:    shrl $8, %r14d
 ; ATOM-NEXT:    shlq $4, %rcx
-; ATOM-NEXT:    shrl $14, %r9d
+; ATOM-NEXT:    shrl $16, %r9d
 ; ATOM-NEXT:    movzbl 3(%rdi,%r14,4), %edi
-; ATOM-NEXT:    andl $1020, %r9d ## imm = 0x3FC
+; ATOM-NEXT:    movzbl %r9b, %r9d
 ; ATOM-NEXT:    shll $24, %edi
-; ATOM-NEXT:    movzbl 2(%rax,%r9), %r9d
+; ATOM-NEXT:    movzbl 2(%rax,%r9,4), %r9d
 ; ATOM-NEXT:    shll $16, %r9d
 ; ATOM-NEXT:    orl %r8d, %r9d
 ; ATOM-NEXT:    movzbl %bl, %r8d
@@ -171,6 +172,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    popq %rbx
 ; ATOM-NEXT:    popq %r14
 ; ATOM-NEXT:    popq %r15
+; ATOM-NEXT:    popq %rbp
 ; ATOM-NEXT:    retq
 entry:
 	%0 = load i32, i32* %rk, align 4		; <i32> [#uses=1]

diff  --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba5ab43368..cada55919c8cea2 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -637,7 +637,7 @@ define i64 @parity_64_shift(i64 %0) {
 ; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
 ; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
 ; X64-NOPOPCNT-NEXT:    setnp %al
-; X64-NOPOPCNT-NEXT:    addl %eax, %eax
+; X64-NOPOPCNT-NEXT:    addq %rax, %rax
 ; X64-NOPOPCNT-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: parity_64_shift:
@@ -654,7 +654,7 @@ define i64 @parity_64_shift(i64 %0) {
 ; X64-POPCNT:       # %bb.0:
 ; X64-POPCNT-NEXT:    popcntq %rdi, %rax
 ; X64-POPCNT-NEXT:    andl $1, %eax
-; X64-POPCNT-NEXT:    addl %eax, %eax
+; X64-POPCNT-NEXT:    addq %rax, %rax
 ; X64-POPCNT-NEXT:    retq
   %2 = tail call i64 @llvm.ctpop.i64(i64 %0)
   %3 = shl nuw nsw i64 %2, 1

diff  --git a/llvm/test/CodeGen/X86/pr62653.ll b/llvm/test/CodeGen/X86/pr62653.ll
index b6a1bf47983dc76..0a03c1831f65791 100644
--- a/llvm/test/CodeGen/X86/pr62653.ll
+++ b/llvm/test/CodeGen/X86/pr62653.ll
@@ -4,117 +4,124 @@
 define <64 x i4> @pr62653(<64 x i4> %a0) nounwind {
 ; CHECK-LABEL: pr62653:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $r9d killed $r9d def $r9
+; CHECK-NEXT:    # kill: def $r8d killed $r8d def $r8
+; CHECK-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    shll $4, %edi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-NEXT:    andl $15, %r10d
+; CHECK-NEXT:    shlq $4, %r10
 ; CHECK-NEXT:    orq %rdi, %r10
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    shll $8, %edi
+; CHECK-NEXT:    shlq $8, %rdi
 ; CHECK-NEXT:    orq %r10, %rdi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-NEXT:    andl $15, %r10d
-; CHECK-NEXT:    shll $12, %r10d
+; CHECK-NEXT:    shlq $12, %r10
 ; CHECK-NEXT:    orq %rdi, %r10
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT:    andl $15, %r11d
+; CHECK-NEXT:    shlq $16, %r11
+; CHECK-NEXT:    orq %r10, %r11
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    shll $16, %edi
-; CHECK-NEXT:    orq %r10, %rdi
+; CHECK-NEXT:    shlq $20, %rdi
+; CHECK-NEXT:    orq %r11, %rdi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-NEXT:    andl $15, %r10d
-; CHECK-NEXT:    shll $20, %r10d
+; CHECK-NEXT:    shlq $24, %r10
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-NEXT:    andl $15, %r11d
-; CHECK-NEXT:    shll $24, %r11d
+; CHECK-NEXT:    shlq $28, %r11
 ; CHECK-NEXT:    orq %r10, %r11
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-NEXT:    shll $28, %r10d
+; CHECK-NEXT:    andl $15, %r10d
+; CHECK-NEXT:    shlq $32, %r10
 ; CHECK-NEXT:    orq %r11, %r10
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-NEXT:    andl $15, %r11d
-; CHECK-NEXT:    shlq $32, %r11
+; CHECK-NEXT:    shlq $36, %r11
 ; CHECK-NEXT:    orq %r10, %r11
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-NEXT:    andl $15, %r10d
-; CHECK-NEXT:    shlq $36, %r10
+; CHECK-NEXT:    shlq $40, %r10
 ; CHECK-NEXT:    orq %r11, %r10
-; CHECK-NEXT:    orq %rdi, %r10
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    shlq $40, %rdi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-NEXT:    andl $15, %r11d
 ; CHECK-NEXT:    shlq $44, %r11
+; CHECK-NEXT:    orq %r10, %r11
 ; CHECK-NEXT:    orq %rdi, %r11
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-NEXT:    andl $15, %edi
 ; CHECK-NEXT:    shlq $48, %rdi
-; CHECK-NEXT:    orq %r11, %rdi
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-NEXT:    andl $15, %r11d
-; CHECK-NEXT:    shlq $52, %r11
-; CHECK-NEXT:    orq %rdi, %r11
-; CHECK-NEXT:    orq %r10, %r11
-; CHECK-NEXT:    movq %r11, 8(%rax)
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT:    andl $15, %edi
-; CHECK-NEXT:    shlq $32, %rdi
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT:    andl $15, %r10d
+; CHECK-NEXT:    shlq $52, %r10
+; CHECK-NEXT:    orq %rdi, %r10
+; CHECK-NEXT:    orq %r11, %r10
+; CHECK-NEXT:    movq %r10, 8(%rax)
 ; CHECK-NEXT:    andl $15, %esi
 ; CHECK-NEXT:    andl $15, %edx
-; CHECK-NEXT:    shll $4, %edx
-; CHECK-NEXT:    orl %esi, %edx
+; CHECK-NEXT:    shlq $4, %rdx
+; CHECK-NEXT:    orq %rsi, %rdx
 ; CHECK-NEXT:    andl $15, %ecx
-; CHECK-NEXT:    shll $8, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    shlq $8, %rcx
+; CHECK-NEXT:    orq %rdx, %rcx
 ; CHECK-NEXT:    andl $15, %r8d
-; CHECK-NEXT:    shll $12, %r8d
-; CHECK-NEXT:    orl %ecx, %r8d
+; CHECK-NEXT:    shlq $12, %r8
+; CHECK-NEXT:    orq %rcx, %r8
 ; CHECK-NEXT:    andl $15, %r9d
-; CHECK-NEXT:    shll $16, %r9d
-; CHECK-NEXT:    orl %r8d, %r9d
+; CHECK-NEXT:    shlq $16, %r9
+; CHECK-NEXT:    orq %r8, %r9
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
 ; CHECK-NEXT:    andl $15, %ecx
-; CHECK-NEXT:    shll $20, %ecx
-; CHECK-NEXT:    orl %r9d, %ecx
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-NEXT:    andl $15, %edx
-; CHECK-NEXT:    shll $24, %edx
+; CHECK-NEXT:    shlq $20, %rcx
+; CHECK-NEXT:    orq %r9, %rcx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
-; CHECK-NEXT:    shll $28, %esi
-; CHECK-NEXT:    orl %edx, %esi
-; CHECK-NEXT:    orl %ecx, %esi
-; CHECK-NEXT:    orq %rdi, %rsi
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT:    andl $15, %ecx
-; CHECK-NEXT:    shlq $36, %rcx
-; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    andl $15, %esi
+; CHECK-NEXT:    shlq $24, %rsi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    andl $15, %edx
-; CHECK-NEXT:    shlq $40, %rdx
+; CHECK-NEXT:    shlq $28, %rdx
+; CHECK-NEXT:    orq %rsi, %rdx
 ; CHECK-NEXT:    orq %rcx, %rdx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
 ; CHECK-NEXT:    andl $15, %ecx
-; CHECK-NEXT:    shlq $44, %rcx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT:    andl $15, %esi
+; CHECK-NEXT:    shlq $36, %rsi
+; CHECK-NEXT:    orq %rcx, %rsi
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    shlq $40, %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
 ; CHECK-NEXT:    orq %rdx, %rcx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    andl $15, %edx
-; CHECK-NEXT:    shlq $48, %rdx
+; CHECK-NEXT:    shlq $44, %rdx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
 ; CHECK-NEXT:    andl $15, %esi
-; CHECK-NEXT:    shlq $52, %rsi
+; CHECK-NEXT:    shlq $48, %rsi
 ; CHECK-NEXT:    orq %rdx, %rsi
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
 ; CHECK-NEXT:    andl $15, %edx
-; CHECK-NEXT:    shlq $56, %rdx
+; CHECK-NEXT:    shlq $52, %rdx
 ; CHECK-NEXT:    orq %rsi, %rdx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
-; CHECK-NEXT:    shlq $60, %rsi
+; CHECK-NEXT:    andl $15, %esi
+; CHECK-NEXT:    shlq $56, %rsi
 ; CHECK-NEXT:    orq %rdx, %rsi
 ; CHECK-NEXT:    orq %rcx, %rsi
-; CHECK-NEXT:    movq %rsi, (%rax)
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    shlq $60, %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    movq %rcx, (%rax)
 ; CHECK-NEXT:    retq
   %res = shufflevector <64 x i4> %a0, <64 x i4> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 64, i32 65, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
   ret <64 x i4> %res

diff  --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 6d2b73e2108ba22..213b2b018d0ad45 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -393,22 +393,22 @@ define void @test6(i32 %C, ptr %A, ptr %B) nounwind {
 define x86_fp80 @test7(i32 %tmp8) nounwind {
 ; GENERIC-LABEL: test7:
 ; GENERIC:       ## %bb.0:
-; GENERIC-NEXT:    ## kill: def $edi killed $edi def $rdi
-; GENERIC-NEXT:    notl %edi
-; GENERIC-NEXT:    shrl $27, %edi
-; GENERIC-NEXT:    andl $-16, %edi
-; GENERIC-NEXT:    leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; GENERIC-NEXT:    fldt (%rdi,%rax)
+; GENERIC-NEXT:    xorl %eax, %eax
+; GENERIC-NEXT:    testl %edi, %edi
+; GENERIC-NEXT:    setns %al
+; GENERIC-NEXT:    shlq $4, %rax
+; GENERIC-NEXT:    leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; GENERIC-NEXT:    fldt (%rax,%rcx)
 ; GENERIC-NEXT:    retq
 ;
 ; ATOM-LABEL: test7:
 ; ATOM:       ## %bb.0:
-; ATOM-NEXT:    ## kill: def $edi killed $edi def $rdi
-; ATOM-NEXT:    leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; ATOM-NEXT:    notl %edi
-; ATOM-NEXT:    shrl $27, %edi
-; ATOM-NEXT:    andl $-16, %edi
-; ATOM-NEXT:    fldt (%rdi,%rax)
+; ATOM-NEXT:    xorl %eax, %eax
+; ATOM-NEXT:    leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; ATOM-NEXT:    testl %edi, %edi
+; ATOM-NEXT:    setns %al
+; ATOM-NEXT:    shlq $4, %rax
+; ATOM-NEXT:    fldt (%rax,%rcx)
 ; ATOM-NEXT:    retq
 ;
 ; ATHLON-LABEL: test7:

diff  --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index eba22036701b4fe..dd054348ac01418 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -628,7 +628,7 @@ define i64 @select_pow2_
diff _neg_invert(i1 zeroext %cond) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    xorb $1, %dil
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    shll $7, %eax
+; X64-NEXT:    shlq $7, %rax
 ; X64-NEXT:    addq $-99, %rax
 ; X64-NEXT:    retq
   %sel = select i1 %cond, i64 -99, i64 29

diff  --git a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
index 03f4c0f61cdd1f5..220b7dc46dd946e 100644
--- a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
@@ -194,7 +194,7 @@ define i64 @sel_shift_bool_i64(i1 %t) {
 ; ANY:       # %bb.0:
 ; ANY-NEXT:    movl %edi, %eax
 ; ANY-NEXT:    andl $1, %eax
-; ANY-NEXT:    shll $16, %eax
+; ANY-NEXT:    shlq $16, %rax
 ; ANY-NEXT:    retq
   %shl = select i1 %t, i64 65536, i64 0
   ret i64 %shl

diff  --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll
index 60ac6df3f77af29..c38318d5f6a253e 100644
--- a/llvm/test/CodeGen/X86/setcc.ll
+++ b/llvm/test/CodeGen/X86/setcc.ll
@@ -64,7 +64,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq $18, %rdi
 ; X64-NEXT:    setb %al
-; X64-NEXT:    shll $6, %eax
+; X64-NEXT:    shlq $6, %rax
 ; X64-NEXT:    retq
   %t0 = icmp ult i64 %x, 18
   %if = select i1 %t0, i64 64, i64 0

diff  --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index cf45641fba63212..ab504d0a43fef53 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -15,8 +15,9 @@ define dso_local i32 @test_lshr_and(i32 %x) {
 ; X64-LABEL: test_lshr_and:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $12, %edi
-; X64-NEXT:    movl array(%rdi), %eax
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    andl $3, %edi
+; X64-NEXT:    movl array(,%rdi,4), %eax
 ; X64-NEXT:    retq
   %tmp2 = lshr i32 %x, 2
   %tmp3 = and i32 %tmp2, 3
@@ -103,8 +104,8 @@ define dso_local ptr @test_exact4(i32 %a, i32 %b, ptr %x)  {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    subl %edi, %esi
-; X64-NEXT:    shrl %esi
-; X64-NEXT:    leaq (%rsi,%rdx), %rax
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    leaq (%rdx,%rsi,4), %rax
 ; X64-NEXT:    retq
   %sub = sub i32 %b, %a
   %shr = lshr exact i32 %sub, 3
@@ -125,8 +126,8 @@ define dso_local ptr @test_exact5(i32 %a, i32 %b, ptr %x)  {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    subl %edi, %esi
-; X64-NEXT:    shrl %esi
-; X64-NEXT:    leaq (%rsi,%rdx), %rax
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    leaq (%rdx,%rsi,4), %rax
 ; X64-NEXT:    retq
   %sub = sub i32 %b, %a
   %shr = lshr exact i32 %sub, 3

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index ce8d2acd035f6e0..3ca0e2121e0d12f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -255,28 +255,28 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSE2-NEXT:    andl $7, %r8d
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    andl $7, %r9d
-; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
-; SSE2-NEXT:    movd %r10d, %xmm0
-; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
@@ -299,28 +299,28 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSSE3-NEXT:    andl $7, %r8d
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    andl $7, %r9d
-; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
-; SSSE3-NEXT:    movd %r10d, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm3
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index 8f78438dedf92da..f3bafec3399a72c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -293,52 +293,52 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX1-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX1-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT:    andl $15, %edi
+; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX1-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    andl $15, %esi
+; AVX1-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $15, %edx
+; AVX1-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $15, %ecx
+; AVX1-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $15, %r8d
+; AVX1-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $15, %r9d
+; AVX1-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
+; AVX1-NEXT:    movl 16(%rbp), %eax
+; AVX1-NEXT:    andl $15, %eax
+; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    movl 24(%rbp), %eax
+; AVX1-NEXT:    andl $15, %eax
+; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 32(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    movl 40(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 48(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 56(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 64(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 72(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 80(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT:    movl 88(%rbp), %eax
-; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT:    andl $15, %edi
-; AVX1-NEXT:    movzwl (%rsp,%rdi,2), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm1
-; AVX1-NEXT:    andl $15, %esi
-; AVX1-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $15, %edx
-; AVX1-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $15, %ecx
-; AVX1-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $15, %r8d
-; AVX1-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $15, %r9d
-; AVX1-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
-; AVX1-NEXT:    movl 16(%rbp), %eax
-; AVX1-NEXT:    andl $15, %eax
 ; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT:    movl 24(%rbp), %eax
+; AVX1-NEXT:    movl 88(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
 ; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    movq %rbp, %rsp
 ; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    retq
@@ -355,52 +355,52 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
+; AVX2-NEXT:    andl $15, %edi
+; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX2-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    andl $15, %esi
+; AVX2-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $15, %edx
+; AVX2-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $15, %ecx
+; AVX2-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $15, %r8d
+; AVX2-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $15, %r9d
+; AVX2-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
+; AVX2-NEXT:    movl 16(%rbp), %eax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    movl 24(%rbp), %eax
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 32(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    movl 40(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 48(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 56(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 64(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 72(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 80(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT:    movl 88(%rbp), %eax
-; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT:    andl $15, %edi
-; AVX2-NEXT:    movzwl (%rsp,%rdi,2), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    andl $15, %esi
-; AVX2-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $15, %edx
-; AVX2-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $15, %r8d
-; AVX2-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $15, %r9d
-; AVX2-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
-; AVX2-NEXT:    movl 16(%rbp), %eax
-; AVX2-NEXT:    andl $15, %eax
 ; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT:    movl 24(%rbp), %eax
+; AVX2-NEXT:    movl 88(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
 ; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
@@ -448,52 +448,52 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX1-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX1-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX1-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT:    andl $7, %eax
+; AVX1-NEXT:    andl $7, %edi
 ; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    andl $7, %esi
+; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $7, %edx
+; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $7, %ecx
+; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $7, %r8d
+; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT:    andl $7, %r9d
+; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT:    andl $7, %edi
-; AVX1-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm1
-; AVX1-NEXT:    andl $7, %esi
-; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $7, %edx
-; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $7, %ecx
-; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $7, %r8d
-; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
-; AVX1-NEXT:    andl $7, %r9d
-; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT:    andl $7, %eax
+; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
 ; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
 ; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
@@ -504,52 +504,52 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT:    andl $7, %eax
+; AVX2-NEXT:    andl $7, %edi
 ; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    andl $7, %esi
+; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $7, %edx
+; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $7, %ecx
+; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $7, %r8d
+; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT:    andl $7, %r9d
+; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT:    andl $7, %eax
+; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT:    andl $7, %edi
-; AVX2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    andl $7, %esi
-; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $7, %edx
-; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $7, %ecx
-; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $7, %r8d
-; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
-; AVX2-NEXT:    andl $7, %r9d
-; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
 ; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
 ; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %x0  = extractelement <8 x i16> %x, i32 %i0
   %x1  = extractelement <8 x i16> %x, i32 %i1
@@ -597,13 +597,13 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, ptr %i) nounwin
 ; ALL-NEXT:    movq %rsp, %rbp
 ; ALL-NEXT:    andq $-32, %rsp
 ; ALL-NEXT:    subq $64, %rsp
-; ALL-NEXT:    movl (%rdi), %eax
-; ALL-NEXT:    movl 8(%rdi), %ecx
+; ALL-NEXT:    movq (%rdi), %rax
+; ALL-NEXT:    movq 8(%rdi), %rcx
 ; ALL-NEXT:    andl $3, %eax
 ; ALL-NEXT:    andl $3, %ecx
-; ALL-NEXT:    movl 16(%rdi), %edx
+; ALL-NEXT:    movq 16(%rdi), %rdx
 ; ALL-NEXT:    andl $3, %edx
-; ALL-NEXT:    movl 24(%rdi), %esi
+; ALL-NEXT:    movq 24(%rdi), %rsi
 ; ALL-NEXT:    andl $3, %esi
 ; ALL-NEXT:    vmovaps %ymm0, (%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -637,13 +637,13 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, ptr %i) nounwin
 define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, ptr %i) nounwind {
 ; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movl (%rdi), %eax
-; ALL-NEXT:    movl 8(%rdi), %ecx
+; ALL-NEXT:    movq (%rdi), %rax
+; ALL-NEXT:    movq 8(%rdi), %rcx
 ; ALL-NEXT:    andl $1, %eax
 ; ALL-NEXT:    andl $1, %ecx
-; ALL-NEXT:    movl 16(%rdi), %edx
+; ALL-NEXT:    movq 16(%rdi), %rdx
 ; ALL-NEXT:    andl $1, %edx
-; ALL-NEXT:    movl 24(%rdi), %esi
+; ALL-NEXT:    movq 24(%rdi), %rsi
 ; ALL-NEXT:    andl $1, %esi
 ; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero

diff  --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 61434e942b3c535..0c57f497aa8aada 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -651,18 +651,18 @@ define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $15, %eax
+; SSE-NEXT:    shlq $15, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    andl $1, %eax
-; AVX-NEXT:    shll $15, %eax
+; AVX-NEXT:    shlq $15, %rax
 ; AVX-NEXT:    retq
 0:
   %1 = load <8 x i8>, ptr %x

diff  --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
index 96e67a52b786af5..3f64a383abd2c91 100644
--- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
+++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
@@ -14,7 +14,7 @@ define i64 @test1(ptr %data) {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzbl (%rdi), %eax
+; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    shll $2, %eax
 ; X64-NEXT:    andl $60, %eax
 ; X64-NEXT:    retq
@@ -37,7 +37,7 @@ define ptr @test2(ptr %data) {
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movzbl (%rdi), %eax
+; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    andl $15, %eax
 ; X64-NEXT:    leaq (%rdi,%rax,4), %rax
 ; X64-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll
index bc0981781df8ffa..8c27e0da6acf75c 100644
--- a/llvm/test/CodeGen/X86/zext-shl.ll
+++ b/llvm/test/CodeGen/X86/zext-shl.ll
@@ -51,7 +51,7 @@ define i64 @i64_zext_shift_i16_zext_i8(i8 %a0) nounwind {
 ; X64-LABEL: i64_zext_shift_i16_zext_i8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    shlq $5, %rax
 ; X64-NEXT:    retq
   %t0 = zext i8 %a0 to i16
   %t1 = shl i16 %t0, 5
@@ -112,7 +112,7 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
 ; X64-LABEL: i128_zext_shift_i64_zext_i8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    shlq $4, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    retq
   %t0 = zext i8 %a0 to i64
@@ -136,7 +136,7 @@ define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind {
 ; X64-LABEL: i128_zext_shift_i64_zext_i16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    shll $7, %eax
+; X64-NEXT:    shlq $7, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    retq
   %t0 = zext i16 %a0 to i64


        


More information about the llvm-commits mailing list