[llvm] [AMDGPU] Select 64-bit moves (PR #70395)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 26 17:11:22 PDT 2023


https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/70395

This allows folding of 64-bit operands if fit into 32-bit. Fixes https://github.com/llvm/llvm-project/issues/67781

>From 2790f73e95aa0a0d8fb2c6c826f3897fb7bc3044 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 26 Oct 2023 15:04:09 -0700
Subject: [PATCH] [AMDGPU] Select 64-bit moves

This allows folding of 64-bit operands if fit into 32-bit.
Fixes https://github.com/llvm/llvm-project/issues/67781
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   13 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |    8 +
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    |    2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   15 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   20 +
 .../AMDGPU/GlobalISel/combine-short-clamp.ll  |    4 +-
 .../AMDGPU/GlobalISel/extractelement.ll       |  124 +-
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  |   24 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |  868 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |  866 +-
 .../AMDGPU/GlobalISel/insertelement.i16.ll    |   52 +-
 .../AMDGPU/GlobalISel/insertelement.i8.ll     |   16 +
 .../AMDGPU/GlobalISel/insertelement.ll        |   24 +-
 ...inst-select-amdgpu-atomic-cmpxchg-flat.mir |  117 +-
 ...st-select-amdgpu-atomic-cmpxchg-global.mir |  136 +-
 .../inst-select-atomicrmw-add-flat.mir        |  330 +-
 .../inst-select-atomicrmw-add-global.mir      |  320 +-
 .../GlobalISel/inst-select-constant.mir       |  360 +-
 .../GlobalISel/inst-select-fconstant.mir      |   42 +-
 .../AMDGPU/GlobalISel/inst-select-fmul.mir    |   93 +-
 .../GlobalISel/inst-select-fract.f64.mir      |   13 +-
 .../inst-select-load-atomic-flat.mir          |  102 +-
 .../inst-select-load-atomic-global.mir        |  156 +-
 .../GlobalISel/inst-select-load-constant.mir  |   54 +-
 .../GlobalISel/inst-select-load-flat.mir      |  772 +-
 .../inst-select-load-global-saddr.mir         |   72 +-
 .../GlobalISel/inst-select-load-global.mir    |  708 +-
 .../GlobalISel/inst-select-load-smrd.mir      |    2 +-
 .../AMDGPU/GlobalISel/inst-select-ptrmask.mir |   36 +-
 .../GlobalISel/inst-select-store-flat.mir     |  120 +-
 .../GlobalISel/inst-select-store-global.mir   |   24 +-
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |   26 +-
 .../llvm.amdgcn.global.atomic.csub.ll         |   30 +-
 .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll     |   36 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll |    7 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.memset.ll  |    7 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     |  305 +-
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     |  280 +-
 .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll     |  191 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     |  175 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |   18 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |  176 +-
 .../atomic_optimizations_local_pointer.ll     |   16 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |    3 +-
 llvm/test/CodeGen/AMDGPU/commute-compares.ll  |    4 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |    3 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     |  824 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  115 +-
 .../AMDGPU/fold-short-64-bit-literals.mir     |    6 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |   32 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |   15 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |  284 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |   42 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        |  324 +-
 .../AMDGPU/global_atomics_scan_fsub.ll        |  324 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.ll        |    5 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    9 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |   37 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |   42 +-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |   52 +-
 .../ipra-return-address-save-restore.ll       |    2 +-
 .../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll    |  322 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  236 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  672 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  120 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  450 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll   |    4 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |   72 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |   77 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |  849 +-
 llvm/test/CodeGen/AMDGPU/offset-split-flat.ll |  861 +-
 .../CodeGen/AMDGPU/offset-split-global.ll     |  402 +-
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        |   68 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |  649 +-
 llvm/test/CodeGen/AMDGPU/salu-to-valu.ll      |    8 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |   15 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |   59 +-
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   | 7359 ++++++++---------
 .../AMDGPU/splitkit-getsubrangeformask.ll     |  326 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |   15 +-
 llvm/test/CodeGen/AMDGPU/swdev380865.ll       |  108 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  158 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |   15 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |   10 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |   15 +-
 .../CodeGen/AMDGPU/use-sgpr-multiple-times.ll |    7 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |    6 +-
 87 files changed, 10103 insertions(+), 11663 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..804ffb90b530241 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -595,11 +595,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
 
     uint64_t Imm;
-    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
-    else {
+      if (AMDGPU::isValid32BitLiteral(Imm, true))
+        break;
+    } else {
       ConstantSDNode *C = cast<ConstantSDNode>(N);
       Imm = C->getZExtValue();
+      if (AMDGPU::isValid32BitLiteral(Imm, false))
+        break;
     }
 
     SDLoc DL(N);
@@ -3014,7 +3018,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
     if (!RC || SIRI->isSGPRClass(RC))
       return false;
 
-    if (RC != &AMDGPU::VS_32RegClass) {
+    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
       AllUsesAcceptSReg = false;
       SDNode * User = *U;
       if (User->isMachineOpcode()) {
@@ -3026,7 +3030,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
-            if (CommutedRC == &AMDGPU::VS_32RegClass)
+            if (CommutedRC == &AMDGPU::VS_32RegClass ||
+                CommutedRC == &AMDGPU::VS_64RegClass)
               AllUsesAcceptSReg = true;
           }
         }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..b68096decbb7db9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2551,11 +2551,13 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
   MachineOperand &ImmOp = I.getOperand(1);
   Register DstReg = I.getOperand(0).getReg();
   unsigned Size = MRI->getType(DstReg).getSizeInBits();
+  bool IsFP = false;
 
   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
   if (ImmOp.isFPImm()) {
     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
     ImmOp.ChangeToImmediate(Imm.getZExtValue());
+    IsFP = true;
   } else if (ImmOp.isCImm()) {
     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
   } else {
@@ -2568,6 +2570,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
   unsigned Opcode;
   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  } else if (Size == 64 &&
+              AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
+    Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
+    I.setDesc(TII.get(Opcode));
+    I.addImplicitDefUseOperands(*MF);
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   } else {
     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
 
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b32ed9fef5dd34e..b7ac90e33f65e00 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -367,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
     SMovOp = AMDGPU::S_MOV_B32;
     break;
   case AMDGPU::V_MOV_B64_PSEUDO:
-    SMovOp = AMDGPU::S_MOV_B64;
+    SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
     break;
   }
   Imm = ImmOp->getImm();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 827c2c156638468..284943eae46500d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5497,9 +5497,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
-    if (Is64BitOp && !AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
-        !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm()))
-      return false;
+    if (Is64BitOp &&
+        !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
+      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+        return false;
+
+      // FIXME: We can use sign extended 64-bit literals, but only for signed
+      //        operands. At the moment we do not know if an operand is signed.
+      //        Such operand will be encoded as its low 32 bits and then either
+      //        correctly sign extended or incorrectly zero extended by HW.
+      if (!Is64BitFPOp && (int32_t)Lo_32(Imm) < 0)
+        return false;
+    }
   }
 
   // Handle non-register types that are treated like immediates.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c1808c..f93b827ec17ab1f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1934,6 +1934,26 @@ def : GCNPat <
   (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+def : GCNPat <
+  (VGPRImm<(i64 imm)>:$imm),
+  (V_MOV_B64_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+  (VGPRImm<(f64 fpimm)>:$imm),
+  (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
+>;
+
+def : GCNPat <
+  (i64 imm:$imm),
+  (S_MOV_B64_IMM_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+  (f64 fpimm:$imm),
+  (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
+>;
+
 def : GCNPat <
   (f32 fpimm:$imm),
   (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index ed525fb83c6de82..621394fd290b0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -41,11 +41,12 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
+; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8001
 ; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
 ; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
 ; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
 
-; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
+; GFX10: v_{{(dual_)?}}cndmask_b32{{(_e32)?}} [[A:v[0-9]+]], 0x8001, [[A]]
 ; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
 define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
 entry:
@@ -56,6 +57,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
+; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8000
 ; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
 ; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
 ; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 701a733d9e8e957..8bf34caea40513d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2090,69 +2090,69 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v16f64_s_s:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s66, 0
+; GCN-NEXT:    s_mov_b32 s64, 0
+; GCN-NEXT:    s_mov_b32 s62, 0
+; GCN-NEXT:    s_mov_b32 s60, 0
+; GCN-NEXT:    s_mov_b32 s58, 0
+; GCN-NEXT:    s_mov_b32 s56, 0
+; GCN-NEXT:    s_mov_b32 s54, 0
+; GCN-NEXT:    s_mov_b32 s52, 0
+; GCN-NEXT:    s_mov_b32 s50, 0
+; GCN-NEXT:    s_mov_b32 s48, 0
+; GCN-NEXT:    s_mov_b32 s46, 0
+; GCN-NEXT:    s_mov_b32 s44, 0
+; GCN-NEXT:    s_mov_b32 s40, 0
 ; GCN-NEXT:    s_mov_b64 s[36:37], 1.0
 ; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    s_mov_b32 s67, 0x40300000
 ; GCN-NEXT:    s_mov_b32 s65, 0x402e0000
-; GCN-NEXT:    s_mov_b32 s64, s66
 ; GCN-NEXT:    s_mov_b32 s63, 0x402c0000
-; GCN-NEXT:    s_mov_b32 s62, s66
 ; GCN-NEXT:    s_mov_b32 s61, 0x402a0000
-; GCN-NEXT:    s_mov_b32 s60, s66
 ; GCN-NEXT:    s_mov_b32 s59, 0x40280000
-; GCN-NEXT:    s_mov_b32 s58, s66
 ; GCN-NEXT:    s_mov_b32 s57, 0x40260000
-; GCN-NEXT:    s_mov_b32 s56, s66
 ; GCN-NEXT:    s_mov_b32 s55, 0x40240000
-; GCN-NEXT:    s_mov_b32 s54, s66
 ; GCN-NEXT:    s_mov_b32 s53, 0x40220000
-; GCN-NEXT:    s_mov_b32 s52, s66
 ; GCN-NEXT:    s_mov_b32 s51, 0x40200000
-; GCN-NEXT:    s_mov_b32 s50, s66
 ; GCN-NEXT:    s_mov_b32 s49, 0x401c0000
-; GCN-NEXT:    s_mov_b32 s48, s66
 ; GCN-NEXT:    s_mov_b32 s47, 0x40180000
-; GCN-NEXT:    s_mov_b32 s46, s66
 ; GCN-NEXT:    s_mov_b32 s45, 0x40140000
-; GCN-NEXT:    s_mov_b32 s44, s66
 ; GCN-NEXT:    s_mov_b64 s[42:43], 4.0
 ; GCN-NEXT:    s_mov_b32 s41, 0x40080000
-; GCN-NEXT:    s_mov_b32 s40, s66
 ; GCN-NEXT:    s_mov_b64 s[38:39], 2.0
 ; GCN-NEXT:    s_movrels_b64 s[0:1], s[36:37]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_mov_b32 s66, 0
 ; GFX10PLUS-NEXT:    s_mov_b64 s[36:37], 1.0
 ; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
+; GFX10PLUS-NEXT:    s_mov_b32 s66, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s64, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s62, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s60, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s58, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s56, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s54, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s52, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s50, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s48, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s46, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s44, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s40, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s67, 0x40300000
 ; GFX10PLUS-NEXT:    s_mov_b32 s65, 0x402e0000
-; GFX10PLUS-NEXT:    s_mov_b32 s64, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s63, 0x402c0000
-; GFX10PLUS-NEXT:    s_mov_b32 s62, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s61, 0x402a0000
-; GFX10PLUS-NEXT:    s_mov_b32 s60, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s59, 0x40280000
-; GFX10PLUS-NEXT:    s_mov_b32 s58, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s57, 0x40260000
-; GFX10PLUS-NEXT:    s_mov_b32 s56, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s55, 0x40240000
-; GFX10PLUS-NEXT:    s_mov_b32 s54, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s53, 0x40220000
-; GFX10PLUS-NEXT:    s_mov_b32 s52, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s51, 0x40200000
-; GFX10PLUS-NEXT:    s_mov_b32 s50, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s49, 0x401c0000
-; GFX10PLUS-NEXT:    s_mov_b32 s48, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s47, 0x40180000
-; GFX10PLUS-NEXT:    s_mov_b32 s46, s66
 ; GFX10PLUS-NEXT:    s_mov_b32 s45, 0x40140000
-; GFX10PLUS-NEXT:    s_mov_b32 s44, s66
 ; GFX10PLUS-NEXT:    s_mov_b64 s[42:43], 4.0
 ; GFX10PLUS-NEXT:    s_mov_b32 s41, 0x40080000
-; GFX10PLUS-NEXT:    s_mov_b32 s40, s66
 ; GFX10PLUS-NEXT:    s_mov_b64 s[38:39], 2.0
 ; GFX10PLUS-NEXT:    s_movrels_b64 s[0:1], s[36:37]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -3085,10 +3085,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:  ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GPRIDX-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GPRIDX-NEXT:    s_mov_b32 s4, 0
+; GPRIDX-NEXT:    s_mov_b32 s5, 0x40080000
 ; GPRIDX-NEXT:    s_mov_b32 s2, 0
 ; GPRIDX-NEXT:    s_mov_b32 s3, 0x40140000
-; GPRIDX-NEXT:    s_mov_b32 s5, 0x40080000
-; GPRIDX-NEXT:    s_mov_b32 s4, s2
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 1
 ; GPRIDX-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3176,10 +3176,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; MOVREL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; MOVREL-NEXT:    s_mov_b32 s4, 0
+; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
 ; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
-; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s4, s2
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3250,7 +3250,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 12
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 9
+; GFX10-NEXT:     wavefront_sgpr_count = 7
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3267,22 +3267,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:    .end_amd_kernel_code_t
 ; GFX10-NEXT:  ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:    s_mov_b32 s3, 0x40140000
-; GFX10-NEXT:    s_mov_b32 s5, 0x40080000
-; GFX10-NEXT:    s_mov_b32 s4, s2
+; GFX10-NEXT:    s_mov_b32 s3, 0x40080000
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 1
-; GFX10-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 2
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 3
-; GFX10-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 4
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX10-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x40140000
+; GFX10-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 4
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3299,7 +3299,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 12
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 9
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -3359,22 +3359,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:    .end_amd_kernel_code_t
 ; GFX11-NEXT:  ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s8, s[0:1], 0x8
+; GFX11-NEXT:    s_load_b32 s6, s[0:1], 0x8
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x40140000
-; GFX11-NEXT:    s_mov_b32 s5, 0x40080000
-; GFX11-NEXT:    s_mov_b32 s4, s2
+; GFX11-NEXT:    s_mov_b32 s3, 0x40080000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 1
-; GFX11-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 2
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 3
-; GFX11-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 4
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX11-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 2
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_mov_b32 s5, 0x40140000
+; GFX11-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 4
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -4784,11 +4784,8 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
 ; MOVREL-LABEL: v_extract_v64i32_32:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], 0x80
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
@@ -4823,11 +4820,8 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
 ; MOVREL-LABEL: v_extract_v64i32_33:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], 0x80
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 66bff4a14cac84b..c6ea046f95a9199 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1473,12 +1473,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] sc1
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1504,12 +1504,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1549,12 +1549,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] sc1
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1748,12 +1748,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 139bb40daa930a8..056629ca354518b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -5824,29 +5824,28 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: s_fshl_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[10:11], 0x7f
-; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX6-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX6-NEXT:    s_sub_i32 s9, s12, 64
-; GFX6-NEXT:    s_sub_i32 s10, 64, s12
-; GFX6-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX6-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX6-NEXT:    s_sub_i32 s9, s10, 64
+; GFX6-NEXT:    s_sub_i32 s11, 64, s10
+; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[0:1], s12
-; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s10
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
-; GFX6-NEXT:    s_or_b64 s[12:13], s[16:17], s[12:13]
+; GFX6-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
+; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_mov_b32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX6-NEXT:    s_lshl_b32 s5, s6, 31
-; GFX6-NEXT:    s_mov_b32 s4, s11
-; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX6-NEXT:    s_sub_i32 s12, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s10, 64, s8
@@ -5871,29 +5870,28 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX8-LABEL: s_fshl_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b64 s[10:11], 0x7f
-; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX8-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX8-NEXT:    s_sub_i32 s9, s12, 64
-; GFX8-NEXT:    s_sub_i32 s10, 64, s12
-; GFX8-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX8-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX8-NEXT:    s_sub_i32 s9, s10, 64
+; GFX8-NEXT:    s_sub_i32 s11, 64, s10
+; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[0:1], s12
-; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s10
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
-; GFX8-NEXT:    s_or_b64 s[12:13], s[16:17], s[12:13]
+; GFX8-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
+; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
-; GFX8-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_mov_b32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX8-NEXT:    s_lshl_b32 s5, s6, 31
-; GFX8-NEXT:    s_mov_b32 s4, s11
-; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX8-NEXT:    s_sub_i32 s12, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s10, 64, s8
@@ -5918,29 +5916,28 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX9-LABEL: s_fshl_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b64 s[10:11], 0x7f
-; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX9-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX9-NEXT:    s_sub_i32 s9, s12, 64
-; GFX9-NEXT:    s_sub_i32 s10, 64, s12
-; GFX9-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX9-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX9-NEXT:    s_sub_i32 s9, s10, 64
+; GFX9-NEXT:    s_sub_i32 s11, 64, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s12, 0
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[0:1], s12
-; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s10
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
-; GFX9-NEXT:    s_or_b64 s[12:13], s[16:17], s[12:13]
+; GFX9-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
+; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_mov_b32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX9-NEXT:    s_lshl_b32 s5, s6, 31
-; GFX9-NEXT:    s_mov_b32 s4, s11
-; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
 ; GFX9-NEXT:    s_sub_i32 s12, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s10, 64, s8
@@ -5965,40 +5962,39 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX10-LABEL: s_fshl_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b64 s[10:11], 0x7f
-; GFX10-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX10-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_sub_i32 s9, s12, 64
-; GFX10-NEXT:    s_sub_i32 s10, 64, s12
-; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX10-NEXT:    s_sub_i32 s9, s10, 64
+; GFX10-NEXT:    s_sub_i32 s11, 64, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX10-NEXT:    s_mov_b32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[0:1], s10
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s12
-; GFX10-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX10-NEXT:    s_lshl_b32 s5, s6, 31
-; GFX10-NEXT:    s_mov_b32 s4, s11
-; GFX10-NEXT:    s_sub_i32 s14, s8, 64
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX10-NEXT:    s_sub_i32 s14, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[4:5], s9
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
 ; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
@@ -6006,47 +6002,45 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
-; GFX10-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshl_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[10:11], 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX11-NEXT:    s_and_not1_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_sub_i32 s9, s12, 64
-; GFX11-NEXT:    s_sub_i32 s10, 64, s12
-; GFX11-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX11-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[8:9], 0x7f, s[8:9]
+; GFX11-NEXT:    s_sub_i32 s9, s10, 64
+; GFX11-NEXT:    s_sub_i32 s11, 64, s10
+; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
+; GFX11-NEXT:    s_mov_b32 s12, 0
+; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s12, 0
-; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[0:1], s10
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s12
-; GFX11-NEXT:    s_lshl_b64 s[12:13], s[0:1], s12
+; GFX11-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
+; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX11-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
+; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX11-NEXT:    s_lshl_b32 s5, s6, 31
-; GFX11-NEXT:    s_mov_b32 s4, s11
-; GFX11-NEXT:    s_sub_i32 s14, s8, 64
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX11-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX11-NEXT:    s_sub_i32 s14, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[4:5], s9
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
 ; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
@@ -6054,7 +6048,7 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
 ; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
-; GFX11-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
+; GFX11-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6575,23 +6569,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshl_i128_svs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s5, s8, 64
-; GFX6-NEXT:    s_sub_i32 s9, 64, s8
-; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX6-NEXT:    s_sub_i32 s5, s6, 64
+; GFX6-NEXT:    s_sub_i32 s7, 64, s6
+; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
 ; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -6605,14 +6598,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s4
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
 ; GFX6-NEXT:    s_and_b32 s0, 1, s5
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s8
+; GFX6-NEXT:    s_and_b32 s0, 1, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6620,31 +6613,30 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, s6, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s9, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_svs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s5, s8, 64
-; GFX8-NEXT:    s_sub_i32 s9, 64, s8
-; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX8-NEXT:    s_sub_i32 s5, s6, 64
+; GFX8-NEXT:    s_sub_i32 s7, 64, s6
+; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX8-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
 ; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -6658,14 +6650,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s5
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s8
+; GFX8-NEXT:    s_and_b32 s0, 1, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6673,32 +6665,31 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, s6, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, s8, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s9, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_svs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s5, s8, 64
-; GFX9-NEXT:    s_sub_i32 s9, 64, s8
-; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX9-NEXT:    s_sub_i32 s5, s6, 64
+; GFX9-NEXT:    s_sub_i32 s7, 64, s6
+; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[0:1], s8
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
 ; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
@@ -6710,14 +6701,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s5
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s8
+; GFX9-NEXT:    s_and_b32 s0, 1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6725,36 +6716,35 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, s6, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, s7, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshl_i128_svs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b64 s[6:7], 0x7f
+; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX10-NEXT:    s_sub_i32 s5, s6, 64
+; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_sub_i32 s5, s8, 64
-; GFX10-NEXT:    s_sub_i32 s6, 64, s8
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s8
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
 ; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_sub_i32 s0, 64, s4
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
@@ -6779,34 +6769,33 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, s9, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, s6, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshl_i128_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[6:7], 0x7f
+; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
+; GFX11-NEXT:    s_sub_i32 s5, s6, 64
+; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], s[6:7], s[4:5]
-; GFX11-NEXT:    s_sub_i32 s5, s8, 64
-; GFX11-NEXT:    s_sub_i32 s6, 64, s8
-; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s8
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX11-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
+; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
 ; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_sub_i32 s0, 64, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -6833,9 +6822,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, s8, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, s6, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, s9, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6845,23 +6834,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshl_i128_vss:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s5, s8, 64
-; GFX6-NEXT:    s_sub_i32 s6, 64, s8
-; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX6-NEXT:    s_sub_i32 s5, s6, 64
+; GFX6-NEXT:    s_sub_i32 s7, 64, s6
+; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s8
-; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s8
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s7
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s6
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s9
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX6-NEXT:    s_mov_b32 s8, s7
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s6
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s10
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
@@ -6900,23 +6888,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX8-LABEL: v_fshl_i128_vss:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s5, s8, 64
-; GFX8-NEXT:    s_sub_i32 s6, 64, s8
-; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX8-NEXT:    s_sub_i32 s5, s6, 64
+; GFX8-NEXT:    s_sub_i32 s7, 64, s6
+; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_mov_b32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
-; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX8-NEXT:    s_and_b32 s5, 1, s9
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX8-NEXT:    s_mov_b32 s8, s7
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_and_b32 s5, 1, s10
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
@@ -6955,23 +6942,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX9-LABEL: v_fshl_i128_vss:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s5, s8, 64
-; GFX9-NEXT:    s_sub_i32 s6, 64, s8
-; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX9-NEXT:    s_sub_i32 s5, s6, 64
+; GFX9-NEXT:    s_sub_i32 s7, 64, s6
+; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
-; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX9-NEXT:    s_and_b32 s5, 1, s9
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX9-NEXT:    s_mov_b32 s8, s7
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_and_b32 s5, 1, s10
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
@@ -7010,39 +6996,38 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX10-LABEL: v_fshl_i128_vss:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_sub_i32 s5, s8, 64
-; GFX10-NEXT:    s_sub_i32 s6, 64, s8
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX10-NEXT:    s_sub_i32 s5, s6, 64
+; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    s_and_b32 s6, 1, s9
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX10-NEXT:    s_and_b32 s6, 1, s8
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX10-NEXT:    s_mov_b32 s8, s7
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    s_lshl_b32 s7, s2, 31
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT:    s_and_b32 s5, 1, s10
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX10-NEXT:    s_and_b32 s5, 1, s9
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
 ; GFX10-NEXT:    s_sub_i32 s10, s4, 64
 ; GFX10-NEXT:    s_sub_i32 s8, 64, s4
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
@@ -7065,40 +7050,38 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX11-LABEL: v_fshl_i128_vss:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], s[6:7], s[4:5]
-; GFX11-NEXT:    s_sub_i32 s5, s8, 64
-; GFX11-NEXT:    s_sub_i32 s6, 64, s8
-; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s8, v[2:3]
+; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
+; GFX11-NEXT:    s_sub_i32 s5, s6, 64
+; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s8, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX11-NEXT:    s_and_b32 s6, 1, s9
+; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX11-NEXT:    s_and_b32 s6, 1, s8
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX11-NEXT:    s_mov_b32 s8, s7
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX11-NEXT:    s_mov_b32 s6, 0
+; GFX11-NEXT:    s_lshl_b32 s7, s2, 31
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT:    s_and_b32 s5, 1, s10
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX11-NEXT:    s_and_b32 s5, 1, s9
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
 ; GFX11-NEXT:    s_sub_i32 s10, s4, 64
 ; GFX11-NEXT:    s_sub_i32 s8, 64, s4
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
 ; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
@@ -7243,71 +7226,69 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
 define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
 ; GFX6-LABEL: s_fshl_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[18:19], 0x7f
-; GFX6-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX6-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX6-NEXT:    s_sub_i32 s17, s22, 64
-; GFX6-NEXT:    s_sub_i32 s23, 64, s22
-; GFX6-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX6-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
+; GFX6-NEXT:    s_sub_i32 s17, s18, 64
+; GFX6-NEXT:    s_sub_i32 s19, 64, s18
+; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX6-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
-; GFX6-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
+; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
+; GFX6-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_mov_b32 s22, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX6-NEXT:    s_lshl_b32 s9, s10, 31
-; GFX6-NEXT:    s_mov_b32 s8, s19
-; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX6-NEXT:    s_sub_i32 s26, s16, 64
-; GFX6-NEXT:    s_sub_i32 s22, 64, s16
+; GFX6-NEXT:    s_sub_i32 s23, s16, 64
+; GFX6-NEXT:    s_sub_i32 s18, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
 ; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
-; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
+; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX6-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX6-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX6-NEXT:    s_lshl_b32 s9, s14, 31
-; GFX6-NEXT:    s_mov_b32 s8, s19
-; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
 ; GFX6-NEXT:    s_sub_i32 s18, s10, 64
 ; GFX6-NEXT:    s_sub_i32 s14, 64, s10
@@ -7332,71 +7313,69 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX8-LABEL: s_fshl_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b64 s[18:19], 0x7f
-; GFX8-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX8-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX8-NEXT:    s_sub_i32 s17, s22, 64
-; GFX8-NEXT:    s_sub_i32 s23, 64, s22
-; GFX8-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX8-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
+; GFX8-NEXT:    s_sub_i32 s17, s18, 64
+; GFX8-NEXT:    s_sub_i32 s19, 64, s18
+; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
-; GFX8-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
+; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
+; GFX8-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
-; GFX8-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_mov_b32 s22, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX8-NEXT:    s_lshl_b32 s9, s10, 31
-; GFX8-NEXT:    s_mov_b32 s8, s19
-; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX8-NEXT:    s_sub_i32 s26, s16, 64
-; GFX8-NEXT:    s_sub_i32 s22, 64, s16
+; GFX8-NEXT:    s_sub_i32 s23, s16, 64
+; GFX8-NEXT:    s_sub_i32 s18, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
 ; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
-; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
+; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX8-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX8-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX8-NEXT:    s_lshl_b32 s9, s14, 31
-; GFX8-NEXT:    s_mov_b32 s8, s19
-; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
 ; GFX8-NEXT:    s_sub_i32 s18, s10, 64
 ; GFX8-NEXT:    s_sub_i32 s14, 64, s10
@@ -7421,71 +7400,69 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX9-LABEL: s_fshl_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b64 s[18:19], 0x7f
-; GFX9-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX9-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX9-NEXT:    s_sub_i32 s17, s22, 64
-; GFX9-NEXT:    s_sub_i32 s23, 64, s22
-; GFX9-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX9-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
+; GFX9-NEXT:    s_sub_i32 s17, s18, 64
+; GFX9-NEXT:    s_sub_i32 s19, 64, s18
+; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], s22
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s23
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
-; GFX9-NEXT:    s_or_b64 s[22:23], s[26:27], s[22:23]
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
+; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
+; GFX9-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_mov_b32 s22, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX9-NEXT:    s_lshl_b32 s9, s10, 31
-; GFX9-NEXT:    s_mov_b32 s8, s19
-; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX9-NEXT:    s_sub_i32 s26, s16, 64
-; GFX9-NEXT:    s_sub_i32 s22, 64, s16
+; GFX9-NEXT:    s_sub_i32 s23, s16, 64
+; GFX9-NEXT:    s_sub_i32 s18, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
 ; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[8:9], s22
-; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[22:23]
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
+; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX9-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX9-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX9-NEXT:    s_lshl_b32 s9, s14, 31
-; GFX9-NEXT:    s_mov_b32 s8, s19
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
 ; GFX9-NEXT:    s_sub_i32 s18, s10, 64
 ; GFX9-NEXT:    s_sub_i32 s14, 64, s10
@@ -7510,73 +7487,71 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX10-LABEL: s_fshl_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b64 s[18:19], 0x7f
-; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX10-NEXT:    s_sub_i32 s17, s22, 64
-; GFX10-NEXT:    s_sub_i32 s23, 64, s22
-; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX10-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
+; GFX10-NEXT:    s_sub_i32 s17, s18, 64
+; GFX10-NEXT:    s_sub_i32 s19, 64, s18
+; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX10-NEXT:    s_mov_b32 s22, 0
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s23
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s22
-; GFX10-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
+; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
+; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
 ; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX10-NEXT:    s_lshl_b32 s9, s10, 31
-; GFX10-NEXT:    s_mov_b32 s8, s19
-; GFX10-NEXT:    s_sub_i32 s26, s16, 64
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX10-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX10-NEXT:    s_sub_i32 s23, s16, 64
 ; GFX10-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
-; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX10-NEXT:    s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX10-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX10-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[20:21]
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX10-NEXT:    s_lshl_b32 s13, s14, 31
-; GFX10-NEXT:    s_mov_b32 s12, s19
-; GFX10-NEXT:    s_sub_i32 s18, s10, 64
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX10-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
+; GFX10-NEXT:    s_sub_i32 s18, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
@@ -7599,74 +7574,71 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX11-LABEL: s_fshl_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b64 s[18:19], 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX11-NEXT:    s_and_not1_b64 s[16:17], s[18:19], s[16:17]
-; GFX11-NEXT:    s_sub_i32 s17, s22, 64
-; GFX11-NEXT:    s_sub_i32 s23, 64, s22
-; GFX11-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX11-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[16:17], 0x7f, s[16:17]
+; GFX11-NEXT:    s_sub_i32 s17, s18, 64
+; GFX11-NEXT:    s_sub_i32 s19, 64, s18
+; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX11-NEXT:    s_mov_b32 s22, 0
+; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX11-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s23
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s22
-; GFX11-NEXT:    s_lshl_b64 s[22:23], s[0:1], s22
+; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
+; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
 ; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
+; GFX11-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX11-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX11-NEXT:    s_lshl_b32 s9, s10, 31
-; GFX11-NEXT:    s_mov_b32 s8, s19
-; GFX11-NEXT:    s_sub_i32 s26, s16, 64
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GFX11-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX11-NEXT:    s_sub_i32 s23, s16, 64
 ; GFX11-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
 ; GFX11-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s26
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
 ; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
-; GFX11-NEXT:    s_and_not1_b64 s[10:11], s[18:19], s[20:21]
+; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX11-NEXT:    s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX11-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX11-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[20:21]
+; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX11-NEXT:    s_lshl_b32 s13, s14, 31
-; GFX11-NEXT:    s_mov_b32 s12, s19
-; GFX11-NEXT:    s_sub_i32 s18, s10, 64
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
+; GFX11-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX11-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
+; GFX11-NEXT:    s_sub_i32 s18, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 25d845f2f9922ac..88fa7a8406475f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -5878,39 +5878,38 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: s_fshr_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_movk_i32 s10, 0x7f
-; GFX6-NEXT:    s_mov_b32 s11, 0
-; GFX6-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX6-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[0:1], 1
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT:    s_lshr_b32 s10, s1, 31
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[0:1], 1
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[10:11]
-; GFX6-NEXT:    s_sub_i32 s13, s8, 64
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[14:15], s8
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[14:15], s9
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
-; GFX6-NEXT:    s_lshl_b64 s[10:11], s[14:15], s13
+; GFX6-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
 ; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
 ; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    s_sub_i32 s14, s12, 64
-; GFX6-NEXT:    s_sub_i32 s13, 64, s12
-; GFX6-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX6-NEXT:    s_sub_i32 s14, s10, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s10
+; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
@@ -5925,39 +5924,38 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX8-LABEL: s_fshr_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s10, 0x7f
-; GFX8-NEXT:    s_mov_b32 s11, 0
-; GFX8-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX8-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[0:1], 1
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT:    s_lshr_b32 s10, s1, 31
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[0:1], 1
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[10:11]
-; GFX8-NEXT:    s_sub_i32 s13, s8, 64
+; GFX8-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[14:15], s8
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[14:15], s9
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
-; GFX8-NEXT:    s_lshl_b64 s[10:11], s[14:15], s13
+; GFX8-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
 ; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
 ; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX8-NEXT:    s_sub_i32 s14, s12, 64
-; GFX8-NEXT:    s_sub_i32 s13, 64, s12
-; GFX8-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX8-NEXT:    s_sub_i32 s14, s10, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s10
+; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
@@ -5972,39 +5970,38 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX9-LABEL: s_fshr_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s10, 0x7f
-; GFX9-NEXT:    s_mov_b32 s11, 0
-; GFX9-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX9-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[0:1], 1
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT:    s_lshr_b32 s10, s1, 31
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[0:1], 1
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[10:11]
-; GFX9-NEXT:    s_sub_i32 s13, s8, 64
+; GFX9-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[14:15], s8
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[14:15], s9
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[14:15], s13
+; GFX9-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
 ; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
 ; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NEXT:    s_sub_i32 s14, s12, 64
-; GFX9-NEXT:    s_sub_i32 s13, 64, s12
-; GFX9-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX9-NEXT:    s_sub_i32 s14, s10, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s10
+; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s12
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s13
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
@@ -6019,94 +6016,92 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ;
 ; GFX10-LABEL: s_fshr_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_movk_i32 s10, 0x7f
-; GFX10-NEXT:    s_mov_b32 s11, 0
+; GFX10-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[8:9], 0x7f, s[8:9]
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX10-NEXT:    s_andn2_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_lshr_b32 s10, s1, 31
+; GFX10-NEXT:    s_lshr_b32 s12, s1, 31
+; GFX10-NEXT:    s_mov_b32 s13, 0
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[10:11]
-; GFX10-NEXT:    s_sub_i32 s13, s8, 64
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[12:13]
+; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
 ; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s13
+; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
 ; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s14, s12, 64
-; GFX10-NEXT:    s_sub_i32 s10, 64, s12
-; GFX10-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX10-NEXT:    s_sub_i32 s14, s10, 64
+; GFX10-NEXT:    s_sub_i32 s11, 64, s10
+; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s12
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
-; GFX10-NEXT:    s_lshr_b64 s[12:13], s[6:7], s12
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshr_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_movk_i32 s10, 0x7f
-; GFX11-NEXT:    s_mov_b32 s11, 0
+; GFX11-NEXT:    s_and_b64 s[10:11], s[8:9], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[8:9], 0x7f, s[8:9]
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT:    s_and_b64 s[12:13], s[8:9], s[10:11]
-; GFX11-NEXT:    s_and_not1_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_lshr_b32 s10, s1, 31
+; GFX11-NEXT:    s_lshr_b32 s12, s1, 31
+; GFX11-NEXT:    s_mov_b32 s13, 0
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[10:11]
-; GFX11-NEXT:    s_sub_i32 s13, s8, 64
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[12:13]
+; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s9
+; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
 ; GFX11-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s13
+; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
 ; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s14, s12, 64
-; GFX11-NEXT:    s_sub_i32 s10, 64, s12
-; GFX11-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX11-NEXT:    s_sub_i32 s14, s10, 64
+; GFX11-NEXT:    s_sub_i32 s11, 64, s10
+; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
 ; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], s12
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
-; GFX11-NEXT:    s_lshr_b64 s[12:13], s[6:7], s12
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
 ; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -6626,45 +6621,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshr_i128_svs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_movk_i32 s6, 0x7f
-; GFX6-NEXT:    s_mov_b32 s7, 0
-; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT:    s_lshr_b32 s6, s1, 31
-; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
-; GFX6-NEXT:    s_sub_i32 s9, s4, 64
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
+; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_sub_i32 s7, s4, 64
 ; GFX6-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[10:11], s4
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[10:11], s9
+; GFX6-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
 ; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s0, s8, 64
-; GFX6-NEXT:    s_sub_i32 s1, 64, s8
-; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s8
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    s_sub_i32 s0, s6, 64
+; GFX6-NEXT:    s_sub_i32 s1, 64, s6
+; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s8
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s6
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s6
+; GFX6-NEXT:    s_and_b32 s0, 1, s7
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s7
+; GFX6-NEXT:    s_and_b32 s0, 1, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6680,45 +6674,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ;
 ; GFX8-LABEL: v_fshr_i128_svs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s6, 0x7f
-; GFX8-NEXT:    s_mov_b32 s7, 0
-; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT:    s_lshr_b32 s6, s1, 31
-; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
-; GFX8-NEXT:    s_sub_i32 s9, s4, 64
+; GFX8-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_sub_i32 s7, s4, 64
 ; GFX8-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[10:11], s4
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX8-NEXT:    s_lshl_b64 s[6:7], s[10:11], s9
+; GFX8-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s0, s8, 64
-; GFX8-NEXT:    s_sub_i32 s1, 64, s8
-; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    s_sub_i32 s0, s6, 64
+; GFX8-NEXT:    s_sub_i32 s1, 64, s6
+; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT:    s_and_b32 s0, 1, s6
+; GFX8-NEXT:    s_and_b32 s0, 1, s7
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s7
+; GFX8-NEXT:    s_and_b32 s0, 1, s8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6734,45 +6727,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ;
 ; GFX9-LABEL: v_fshr_i128_svs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s6, 0x7f
-; GFX9-NEXT:    s_mov_b32 s7, 0
-; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], 1
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 31
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
-; GFX9-NEXT:    s_sub_i32 s9, s4, 64
+; GFX9-NEXT:    s_lshr_b32 s0, s1, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_sub_i32 s7, s4, 64
 ; GFX9-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[10:11], s4
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[10:11], s5
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[10:11], s9
+; GFX9-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s0, s8, 64
-; GFX9-NEXT:    s_sub_i32 s1, 64, s8
-; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    s_sub_i32 s0, s6, 64
+; GFX9-NEXT:    s_sub_i32 s1, 64, s6
+; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT:    s_and_b32 s0, 1, s6
+; GFX9-NEXT:    s_and_b32 s0, 1, s7
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s7
+; GFX9-NEXT:    s_and_b32 s0, 1, s8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6788,45 +6780,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ;
 ; GFX10-LABEL: v_fshr_i128_svs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_movk_i32 s6, 0x7f
-; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 31
+; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
+; GFX10-NEXT:    s_mov_b32 s9, 0
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT:    s_sub_i32 s9, s4, 64
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_sub_i32 s7, s4, 64
 ; GFX10-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s5
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
 ; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s0, 64, s8
+; GFX10-NEXT:    s_sub_i32 s0, 64, s6
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT:    s_sub_i32 s0, s8, 64
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_sub_i32 s0, s6, 64
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s6
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s8, v[2:3]
+; GFX10-NEXT:    s_and_b32 s0, 1, s7
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
@@ -6842,46 +6833,45 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ;
 ; GFX11-LABEL: v_fshr_i128_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_movk_i32 s6, 0x7f
-; GFX11-NEXT:    s_mov_b32 s7, 0
+; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], s[6:7], s[4:5]
-; GFX11-NEXT:    s_lshr_b32 s6, s1, 31
+; GFX11-NEXT:    s_lshr_b32 s8, s1, 31
+; GFX11-NEXT:    s_mov_b32 s9, 0
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
-; GFX11-NEXT:    s_sub_i32 s9, s4, 64
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX11-NEXT:    s_sub_i32 s7, s4, 64
 ; GFX11-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s5
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
 ; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s0, 64, s8
+; GFX11-NEXT:    s_sub_i32 s0, 64, s6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT:    s_sub_i32 s0, s8, 64
-; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX11-NEXT:    s_sub_i32 s0, s6, 64
+; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s1
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s6
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s8, v[2:3]
+; GFX11-NEXT:    s_and_b32 s0, 1, s7
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -6904,42 +6894,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshr_i128_vss:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX6-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX6-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GFX6-NEXT:    s_sub_i32 s5, s4, 64
-; GFX6-NEXT:    s_sub_i32 s6, 64, s4
+; GFX6-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], s6
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], s7
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s4
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], s4
-; GFX6-NEXT:    s_and_b32 s4, 1, s7
+; GFX6-NEXT:    s_and_b32 s4, 1, s8
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX6-NEXT:    s_and_b32 s4, 1, s9
-; GFX6-NEXT:    s_sub_i32 s10, s8, 64
-; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_sub_i32 s10, s6, 64
+; GFX6-NEXT:    s_sub_i32 s8, 64, s6
 ; GFX6-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[4:5], s5
-; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
@@ -6958,42 +6947,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX8-LABEL: v_fshr_i128_vss:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX8-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX8-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX8-NEXT:    s_sub_i32 s5, s4, 64
-; GFX8-NEXT:    s_sub_i32 s6, 64, s4
+; GFX8-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s6, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX8-NEXT:    s_and_b32 s4, 1, s7
+; GFX8-NEXT:    s_and_b32 s4, 1, s8
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_and_b32 s4, 1, s9
-; GFX8-NEXT:    s_sub_i32 s10, s8, 64
-; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_sub_i32 s10, s6, 64
+; GFX8-NEXT:    s_sub_i32 s8, 64, s6
 ; GFX8-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
@@ -7012,42 +7000,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ;
 ; GFX9-LABEL: v_fshr_i128_vss:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX9-NEXT:    s_sub_i32 s5, s4, 64
-; GFX9-NEXT:    s_sub_i32 s6, 64, s4
+; GFX9-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s6, v[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX9-NEXT:    s_and_b32 s4, 1, s7
+; GFX9-NEXT:    s_and_b32 s4, 1, s8
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT:    s_and_b32 s4, 1, s9
-; GFX9-NEXT:    s_sub_i32 s10, s8, 64
-; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_sub_i32 s10, s6, 64
+; GFX9-NEXT:    s_sub_i32 s8, 64, s6
 ; GFX9-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s8
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s9
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
@@ -7068,41 +7055,40 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
-; GFX10-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX10-NEXT:    s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
+; GFX10-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX10-NEXT:    s_sub_i32 s6, 64, s4
 ; GFX10-NEXT:    s_sub_i32 s5, s4, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, 1, s7
+; GFX10-NEXT:    s_and_b32 s4, 1, s8
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    s_and_b32 s4, 1, s9
-; GFX10-NEXT:    s_sub_i32 s10, s8, 64
-; GFX10-NEXT:    s_sub_i32 s6, 64, s8
-; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX10-NEXT:    s_sub_i32 s10, s6, 64
+; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
 ; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s8
-; GFX10-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
@@ -7112,7 +7098,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
 ; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
@@ -7122,39 +7108,39 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
-; GFX11-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], s[6:7], s[4:5]
+; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX11-NEXT:    s_sub_i32 s6, 64, s4
 ; GFX11-NEXT:    s_sub_i32 s5, s4, 64
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
 ; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    s_and_b32 s4, 1, s7
+; GFX11-NEXT:    s_and_b32 s4, 1, s8
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    s_and_b32 s4, 1, s9
-; GFX11-NEXT:    s_sub_i32 s10, s8, 64
-; GFX11-NEXT:    s_sub_i32 s6, 64, s8
-; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
+; GFX11-NEXT:    s_sub_i32 s10, s6, 64
+; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
 ; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[0:1], s8
-; GFX11-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
+; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
+; GFX11-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
 ; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
@@ -7163,7 +7149,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[8:9], 0
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
 ; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
@@ -7301,56 +7287,54 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_movk_i32 s18, 0x7f
-; GFX6-NEXT:    s_mov_b32 s19, 0
-; GFX6-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX6-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX6-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT:    s_lshr_b32 s0, s1, 31
-; GFX6-NEXT:    s_mov_b32 s1, s19
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT:    s_sub_i32 s23, s16, 64
+; GFX6-NEXT:    s_lshr_b32 s24, s1, 31
+; GFX6-NEXT:    s_mov_b32 s25, 0
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
+; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
+; GFX6-NEXT:    s_sub_i32 s19, s16, 64
 ; GFX6-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
+; GFX6-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
 ; GFX6-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
-; GFX6-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX6-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX6-NEXT:    s_sub_i32 s26, s22, 64
-; GFX6-NEXT:    s_sub_i32 s24, 64, s22
-; GFX6-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX6-NEXT:    s_sub_i32 s24, s18, 64
+; GFX6-NEXT:    s_sub_i32 s22, 64, s18
+; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX6-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
-; GFX6-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
+; GFX6-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
+; GFX6-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
 ; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX6-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT:    s_lshr_b32 s18, s5, 31
+; GFX6-NEXT:    s_lshr_b32 s24, s5, 31
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
-; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[18:19]
+; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX6-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX6-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
@@ -7390,56 +7374,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX8-LABEL: s_fshr_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s18, 0x7f
-; GFX8-NEXT:    s_mov_b32 s19, 0
-; GFX8-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX8-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX8-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT:    s_lshr_b32 s0, s1, 31
-; GFX8-NEXT:    s_mov_b32 s1, s19
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT:    s_sub_i32 s23, s16, 64
+; GFX8-NEXT:    s_lshr_b32 s24, s1, 31
+; GFX8-NEXT:    s_mov_b32 s25, 0
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
+; GFX8-NEXT:    s_sub_i32 s19, s16, 64
 ; GFX8-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
+; GFX8-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
 ; GFX8-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
-; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX8-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX8-NEXT:    s_sub_i32 s26, s22, 64
-; GFX8-NEXT:    s_sub_i32 s24, 64, s22
-; GFX8-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX8-NEXT:    s_sub_i32 s24, s18, 64
+; GFX8-NEXT:    s_sub_i32 s22, 64, s18
+; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX8-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
-; GFX8-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
+; GFX8-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
+; GFX8-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
 ; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX8-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT:    s_lshr_b32 s18, s5, 31
+; GFX8-NEXT:    s_lshr_b32 s24, s5, 31
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
-; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[18:19]
+; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX8-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX8-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
@@ -7479,56 +7461,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX9-LABEL: s_fshr_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s18, 0x7f
-; GFX9-NEXT:    s_mov_b32 s19, 0
-; GFX9-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX9-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], 1
+; GFX9-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 31
-; GFX9-NEXT:    s_mov_b32 s1, s19
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT:    s_sub_i32 s23, s16, 64
+; GFX9-NEXT:    s_lshr_b32 s24, s1, 31
+; GFX9-NEXT:    s_mov_b32 s25, 0
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
+; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
+; GFX9-NEXT:    s_sub_i32 s19, s16, 64
 ; GFX9-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[24:25], s16
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[24:25], s17
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
+; GFX9-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
 ; GFX9-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[24:25], s23
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[24:25]
-; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX9-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX9-NEXT:    s_sub_i32 s26, s22, 64
-; GFX9-NEXT:    s_sub_i32 s24, 64, s22
-; GFX9-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX9-NEXT:    s_sub_i32 s24, s18, 64
+; GFX9-NEXT:    s_sub_i32 s22, 64, s18
+; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s22
-; GFX9-NEXT:    s_lshr_b64 s[22:23], s[8:9], s22
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[10:11], s24
-; GFX9-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
+; GFX9-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
+; GFX9-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[22:23], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
-; GFX9-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
+; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
+; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT:    s_lshr_b32 s18, s5, 31
+; GFX9-NEXT:    s_lshr_b32 s24, s5, 31
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
-; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[18:19]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX9-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX9-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
@@ -7568,56 +7548,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX10-LABEL: s_fshr_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_movk_i32 s18, 0x7f
-; GFX10-NEXT:    s_mov_b32 s19, 0
+; GFX10-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX10-NEXT:    s_andn2_b64 s[16:17], 0x7f, s[16:17]
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX10-NEXT:    s_andn2_b64 s[16:17], s[18:19], s[16:17]
-; GFX10-NEXT:    s_lshr_b32 s24, s1, 31
-; GFX10-NEXT:    s_mov_b32 s25, s19
+; GFX10-NEXT:    s_lshr_b32 s22, s1, 31
+; GFX10-NEXT:    s_mov_b32 s23, 0
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[24:25]
-; GFX10-NEXT:    s_sub_i32 s23, s16, 64
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[22:23]
+; GFX10-NEXT:    s_sub_i32 s19, s16, 64
 ; GFX10-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
 ; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
 ; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s23
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX10-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s26, s22, 64
-; GFX10-NEXT:    s_sub_i32 s23, 64, s22
-; GFX10-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX10-NEXT:    s_sub_i32 s22, s18, 64
+; GFX10-NEXT:    s_sub_i32 s19, 64, s18
+; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX10-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s22
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s23
-; GFX10-NEXT:    s_lshr_b64 s[22:23], s[10:11], s22
+; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
+; GFX10-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
-; GFX10-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[20:21]
-; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT:    s_lshr_b32 s18, s5, 31
+; GFX10-NEXT:    s_lshr_b32 s22, s5, 31
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[18:19]
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
@@ -7657,56 +7635,54 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ;
 ; GFX11-LABEL: s_fshr_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_movk_i32 s18, 0x7f
-; GFX11-NEXT:    s_mov_b32 s19, 0
+; GFX11-NEXT:    s_and_b64 s[18:19], s[16:17], 0x7f
+; GFX11-NEXT:    s_and_not1_b64 s[16:17], 0x7f, s[16:17]
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT:    s_and_b64 s[22:23], s[16:17], s[18:19]
-; GFX11-NEXT:    s_and_not1_b64 s[16:17], s[18:19], s[16:17]
-; GFX11-NEXT:    s_lshr_b32 s24, s1, 31
-; GFX11-NEXT:    s_mov_b32 s25, s19
+; GFX11-NEXT:    s_lshr_b32 s22, s1, 31
+; GFX11-NEXT:    s_mov_b32 s23, 0
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[24:25]
-; GFX11-NEXT:    s_sub_i32 s23, s16, 64
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[22:23]
+; GFX11-NEXT:    s_sub_i32 s19, s16, 64
 ; GFX11-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
 ; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
 ; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s23
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX11-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s26, s22, 64
-; GFX11-NEXT:    s_sub_i32 s23, 64, s22
-; GFX11-NEXT:    s_cmp_lt_u32 s22, 64
+; GFX11-NEXT:    s_sub_i32 s22, s18, 64
+; GFX11-NEXT:    s_sub_i32 s19, 64, s18
+; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s22, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], s22
-; GFX11-NEXT:    s_lshl_b64 s[24:25], s[10:11], s23
-; GFX11-NEXT:    s_lshr_b64 s[22:23], s[10:11], s22
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX11-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
+; GFX11-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s26
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
-; GFX11-NEXT:    s_and_not1_b64 s[10:11], s[18:19], s[20:21]
-; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], s[18:19]
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT:    s_lshr_b32 s18, s5, 31
+; GFX11-NEXT:    s_lshr_b32 s22, s5, 31
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[18:19]
+; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index d6957be8ab8ffbb..64c3cd4e8c067f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -146,6 +146,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
@@ -515,6 +516,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -608,6 +610,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
@@ -701,6 +704,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -866,6 +870,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1406,6 +1411,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_not_b32_e32 v2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -1529,6 +1535,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
@@ -1653,6 +1660,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_not_b32_e32 v3, v3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -1976,6 +1984,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    s_not_b32 s6, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -2693,6 +2702,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
@@ -2846,6 +2856,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    s_not_b32 s5, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
@@ -3001,6 +3012,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -3351,6 +3363,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_movrels_b32_e32 v0, v2
@@ -4289,11 +4302,11 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ;
 ; GFX7-LABEL: insertelement_v_v16i16_s_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s18, 0
-; GFX7-NEXT:    s_mov_b32 s19, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[16:17], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[12:13], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
@@ -4309,7 +4322,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
-; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_mov_b32 s14, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
@@ -4325,13 +4338,14 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[12:13], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; GFX7-NEXT:    s_mov_b64 s[16:17], 16
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v16i16_s_v:
@@ -4523,6 +4537,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_movrels_b32_e32 v1, v3
@@ -4686,11 +4701,11 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ;
 ; GFX7-LABEL: insertelement_v_v16i16_v_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s18, 0
-; GFX7-NEXT:    s_mov_b32 s19, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[16:17], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
+; GFX7-NEXT:    s_mov_b32 s14, 0
+; GFX7-NEXT:    s_mov_b32 s15, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[12:13], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[12:15], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[12:15], 0 addr64 offset:16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
@@ -4706,7 +4721,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
-; GFX7-NEXT:    s_mov_b32 s18, -1
+; GFX7-NEXT:    s_mov_b32 s14, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -4722,13 +4737,14 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[12:13], 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; GFX7-NEXT:    s_mov_b64 s[16:17], 16
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v16i16_v_v:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index d531462ae9cc965..16b702edff2db94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -159,6 +159,7 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
@@ -581,6 +582,7 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -681,6 +683,7 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -781,6 +784,7 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -943,6 +947,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
@@ -1312,6 +1317,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -1405,6 +1411,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
@@ -1498,6 +1505,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -1741,6 +1749,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -2281,6 +2290,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_not_b32_e32 v2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -2404,6 +2414,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
@@ -2528,6 +2539,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_not_b32_e32 v3, v3
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -2851,6 +2863,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    s_not_b32 s6, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -3568,6 +3581,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
@@ -3721,6 +3735,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    s_not_b32 s5, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
@@ -3876,6 +3891,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 61439021a88757a..dc9cbb498dab4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -687,17 +687,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_mov_b32 s18, 0
+; GPRIDX-NEXT:    s_mov_b32 s16, 0
+; GPRIDX-NEXT:    s_mov_b32 s14, 0
+; GPRIDX-NEXT:    s_mov_b32 s12, 0
+; GPRIDX-NEXT:    s_mov_b32 s8, 0
 ; GPRIDX-NEXT:    s_mov_b64 s[4:5], 1.0
 ; GPRIDX-NEXT:    s_mov_b32 s19, 0x40200000
 ; GPRIDX-NEXT:    s_mov_b32 s17, 0x401c0000
-; GPRIDX-NEXT:    s_mov_b32 s16, s18
 ; GPRIDX-NEXT:    s_mov_b32 s15, 0x40180000
-; GPRIDX-NEXT:    s_mov_b32 s14, s18
 ; GPRIDX-NEXT:    s_mov_b32 s13, 0x40140000
-; GPRIDX-NEXT:    s_mov_b32 s12, s18
 ; GPRIDX-NEXT:    s_mov_b64 s[10:11], 4.0
 ; GPRIDX-NEXT:    s_mov_b32 s9, 0x40080000
-; GPRIDX-NEXT:    s_mov_b32 s8, s18
 ; GPRIDX-NEXT:    s_mov_b64 s[6:7], 2.0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s4
 ; GPRIDX-NEXT:    v_mov_b32_e32 v4, s5
@@ -753,17 +753,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s18, 0
+; GFX10-NEXT:    s_mov_b32 s16, 0
+; GFX10-NEXT:    s_mov_b32 s14, 0
+; GFX10-NEXT:    s_mov_b32 s12, 0
+; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1.0
 ; GFX10-NEXT:    s_mov_b32 s19, 0x40200000
 ; GFX10-NEXT:    s_mov_b32 s17, 0x401c0000
-; GFX10-NEXT:    s_mov_b32 s16, s18
 ; GFX10-NEXT:    s_mov_b32 s15, 0x40180000
-; GFX10-NEXT:    s_mov_b32 s14, s18
 ; GFX10-NEXT:    s_mov_b32 s13, 0x40140000
-; GFX10-NEXT:    s_mov_b32 s12, s18
 ; GFX10-NEXT:    s_mov_b64 s[10:11], 4.0
 ; GFX10-NEXT:    s_mov_b32 s9, 0x40080000
-; GFX10-NEXT:    s_mov_b32 s8, s18
 ; GFX10-NEXT:    s_mov_b64 s[6:7], 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s5
@@ -820,16 +820,16 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s14, 0
 ; GFX11-NEXT:    s_mov_b32 s15, 0x40200000
+; GFX11-NEXT:    s_mov_b32 s12, 0
+; GFX11-NEXT:    s_mov_b32 s10, 0
+; GFX11-NEXT:    s_mov_b32 s8, 0
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 1.0
 ; GFX11-NEXT:    s_mov_b32 s13, 0x401c0000
-; GFX11-NEXT:    s_mov_b32 s12, s14
 ; GFX11-NEXT:    s_mov_b32 s11, 0x40180000
-; GFX11-NEXT:    s_mov_b32 s10, s14
 ; GFX11-NEXT:    s_mov_b32 s9, 0x40140000
-; GFX11-NEXT:    s_mov_b32 s8, s14
 ; GFX11-NEXT:    s_mov_b64 s[6:7], 4.0
 ; GFX11-NEXT:    s_mov_b32 s5, 0x40080000
-; GFX11-NEXT:    s_mov_b32 s4, s14
 ; GFX11-NEXT:    s_mov_b64 s[2:3], 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
index 2714982163fec85..f1c3673ae29dd3c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
@@ -23,6 +23,7 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -32,6 +33,7 @@ body:             |
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -41,6 +43,7 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -75,18 +78,17 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -96,6 +98,7 @@ body:             |
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -103,18 +106,17 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -153,6 +155,7 @@ body:             |
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -162,6 +165,7 @@ body:             |
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -171,6 +175,7 @@ body:             |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -205,18 +210,17 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -226,6 +230,7 @@ body:             |
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -233,18 +238,17 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -281,18 +285,17 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -300,18 +303,17 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -319,18 +321,17 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -338,17 +339,15 @@ body:             |
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -378,6 +377,7 @@ body:             |
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX7-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -386,6 +386,7 @@ body:             |
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX9-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -394,6 +395,7 @@ body:             |
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
     ; GFX10-NEXT: FLAT_ATOMIC_CMPSWAP [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -427,6 +429,7 @@ body:             |
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX7-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -435,6 +438,7 @@ body:             |
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX9-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -443,6 +447,7 @@ body:             |
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
     ; GFX10-NEXT: FLAT_ATOMIC_CMPSWAP_X2 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
index bc131f53910d9b6..e1ef96bec0fdbad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir
@@ -144,17 +144,15 @@ body:             |
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4
@@ -164,17 +162,15 @@ body:             |
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4
@@ -344,17 +340,15 @@ body:             |
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ;
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4
@@ -364,17 +358,15 @@ body:             |
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
     ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4, implicit $exec
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]]
     ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4
@@ -423,22 +415,20 @@ body:             |
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE4]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN]].sub0
     ; GFX6-NEXT: $vgpr0 = COPY [[COPY7]]
     ;
@@ -449,22 +439,20 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE3]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE4]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN]].sub0
     ; GFX7-NEXT: $vgpr0 = COPY [[COPY7]]
     ;
@@ -475,17 +463,15 @@ body:             |
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4
@@ -495,17 +481,15 @@ body:             |
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
     ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4
@@ -834,19 +818,17 @@ body:             |
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX7-FLAT-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-    ; GFX7-FLAT-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4095
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc
     ; GFX7-FLAT-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def dead $scc, implicit $scc
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+    ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
@@ -855,19 +837,17 @@ body:             |
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-    ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4095
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc
     ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def dead $scc, implicit $scc
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
-    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+    ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]]
     ;
     ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
index 78071022fc05c26..868f08d805caa4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
@@ -21,6 +21,7 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -28,6 +29,7 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -35,6 +37,7 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -64,18 +67,21 @@ body:             |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -102,18 +108,17 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -121,23 +126,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -168,39 +173,38 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -229,18 +233,17 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -248,23 +251,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -295,39 +298,38 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -356,18 +358,17 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -375,23 +376,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -422,39 +423,38 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -483,68 +483,63 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4097
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -569,65 +564,60 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -652,6 +642,7 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -659,6 +650,7 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -666,6 +658,7 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s64
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -695,18 +688,21 @@ body:             |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s64_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -733,18 +729,17 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -752,23 +747,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -799,39 +794,38 @@ body:             |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64))
+    ;
     ; GFX11-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
index d3eaee283d64475..43d9b911b9f3358 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir
@@ -27,6 +27,7 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -34,6 +35,7 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -41,6 +43,7 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -48,6 +51,7 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -82,24 +86,28 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -133,23 +141,23 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -157,6 +165,7 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -164,6 +173,7 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2047
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -200,34 +210,36 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -263,23 +275,23 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -287,23 +299,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2048
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -340,44 +352,44 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 2048, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -413,23 +425,23 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
@@ -437,23 +449,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4095
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -490,44 +502,44 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -564,73 +576,69 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4097
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
@@ -662,70 +670,66 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4097, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4097, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: GLOBAL_ATOMIC_ADD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 1)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
     %2:vgpr(s64) = G_CONSTANT i64 4097
@@ -755,6 +759,7 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
@@ -762,6 +767,7 @@ body:             |
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -769,6 +775,7 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -776,6 +783,7 @@ body:             |
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s64
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -810,24 +818,28 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_X2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s64_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -861,23 +873,23 @@ body:             |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]]
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -885,23 +897,23 @@ body:             |
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE]], [[COPY1]], 0, 1, implicit $exec :: (load store seq_cst (s64), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]]
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s64_offset4095
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -938,44 +950,44 @@ body:             |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_ATOMIC_ADD_X2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_X2 [[COPY]], [[COPY1]], 4095, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: GLOBAL_ATOMIC_ADD_X2 [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 1)
+    ;
     ; GFX11-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
index dadc6f32dfb8cf2..ca3fd71f6c98146 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
@@ -136,58 +136,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_v_s64
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_v_s64
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:vgpr(s64) = G_CONSTANT i64 0
     %1:vgpr(s64) = G_CONSTANT i64 1
     %2:vgpr(s64) = G_CONSTANT i64 -1
@@ -208,42 +184,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_s_s64
-    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_s_s64
-    ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:sgpr(s64) = G_CONSTANT i64 0
     %1:sgpr(s64) = G_CONSTANT i64 1
     %2:sgpr(s64) = G_CONSTANT i64 -1
@@ -351,42 +319,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_s_p1
-    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_s_p1
-    ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:sgpr(p1) = G_CONSTANT i64 0
     %1:sgpr(p1) = G_CONSTANT i64 1
     %2:sgpr(p1) = G_CONSTANT i64 -1
@@ -407,58 +367,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_v_p1
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_v_p1
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:vgpr(p1) = G_CONSTANT i64 0
     %1:vgpr(p1) = G_CONSTANT i64 1
     %2:vgpr(p1) = G_CONSTANT i64 -1
@@ -479,42 +415,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_s_p999
-    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE64-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE64-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_s_p999
-    ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-    ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242
-    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+    ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+    ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+    ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+    ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+    ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
-    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
-    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
+    ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; WAVE32-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
-    ; WAVE32-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:sgpr(p999) = G_CONSTANT i64 0
     %1:sgpr(p999) = G_CONSTANT i64 1
     %2:sgpr(p999) = G_CONSTANT i64 -1
@@ -535,58 +463,34 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; WAVE64-LABEL: name: constant_v_p999
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE64-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE64-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     ;
     ; WAVE32-LABEL: name: constant_v_p999
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
+    ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
     ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
-    ; WAVE32-NEXT: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; WAVE32-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1
-    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]]
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
     %0:vgpr(p999) = G_CONSTANT i64 0
     %1:vgpr(p999) = G_CONSTANT i64 1
     %2:vgpr(p999) = G_CONSTANT i64 -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
index 23b10218cbbe893..2465c374cc11d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
@@ -61,21 +61,13 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GCN-LABEL: name: fconstant_v_s64
-    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075838976, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
-    ; GCN-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1073741824, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1
-    ; GCN-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1076101120, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1
-    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[REG_SEQUENCE]]
-    ; GCN-NEXT: $vgpr2_vgpr3 = COPY [[REG_SEQUENCE1]]
-    ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]]
+    ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4620693217682128896, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4611686018427387904, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4621819117588971520, implicit $exec
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MOV_B]]
+    ; GCN-NEXT: $vgpr2_vgpr3 = COPY [[V_MOV_B1]]
+    ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MOV_B2]], implicit [[V_MOV_B3]]
     %0:vgpr(s64) = G_FCONSTANT double 1.0
     %1:vgpr(s64) = G_FCONSTANT double 8.0
     %2:vgpr(s64) = G_FCONSTANT double -2.0
@@ -95,17 +87,13 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GCN-LABEL: name: fconstant_s_s64
-    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408
-    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1075838976
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904
-    ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1071382528
-    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
-    ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]]
-    ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[REG_SEQUENCE]]
-    ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_1]], implicit [[REG_SEQUENCE1]]
+    ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4607182418800017408
+    ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896
+    ; GCN-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4611686018427387904
+    ; GCN-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288
+    ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B]]
+    ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B1]]
+    ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
     %0:sgpr(s64) = G_FCONSTANT double 1.0
     %1:sgpr(s64) = G_FCONSTANT double 8.0
     %2:sgpr(s64) = G_FCONSTANT double -2.0
@@ -136,7 +124,6 @@ body: |
     %2:vgpr(s32) = G_ANYEXT %0
     %3:vgpr(s32) = G_ANYEXT %1
 
-    ; Test without already assigned register class
     %4:vgpr(s16) = G_FCONSTANT half 1.0
     %5:vgpr(s16) = G_FCONSTANT half 8.0
     $vgpr0 = COPY %2
@@ -168,7 +155,6 @@ body: |
     %2:vgpr(s32) = G_ANYEXT %0
     %3:vgpr(s32) = G_ANYEXT %1
 
-    ; Test without already assigned register class
     %4:sgpr(s16) = G_FCONSTANT half 1.0
     %5:sgpr(s16) = G_FCONSTANT half 8.0
     $sgpr0 = COPY %2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
index 15ece434487ed16..6ccd2a9c3e67859 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
@@ -17,24 +17,21 @@ body: |
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
-    ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = COPY $vgpr1
     %3:vgpr(p1) = COPY $vgpr3_vgpr4
 
-    ; fmul vs
     %4:vgpr(s32) = G_FMUL %1, %0
 
-    ; fmul sv
     %5:vgpr(s32) = G_FMUL %0, %1
 
-    ; fmul vv
     %6:vgpr(s32) = G_FMUL %1, %2
 
     G_STORE %4, %3 :: (store (s32), addrspace 1)
@@ -57,22 +54,19 @@ body: |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GCN-NEXT: %4:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %5:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %6:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0, implicit %4, implicit %5, implicit %6
+    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F64_e64_2:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MUL_F64_e64_]], implicit [[V_MUL_F64_e64_1]], implicit [[V_MUL_F64_e64_2]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = COPY $vgpr0_vgpr1
     %2:vgpr(s64) = COPY $vgpr2_vgpr3
     %3:vgpr(p1) = COPY $vgpr4_vgpr5
 
-    ; fmul vs
     %4:vgpr(s64) = G_FMUL %1, %0
 
-    ; fmul sv
     %5:vgpr(s64) = G_FMUL %0, %1
 
-    ; fmul vv
     %6:vgpr(s64) = G_FMUL %1, %2
     S_ENDPGM 0, implicit %4, implicit %5, implicit %6
 
@@ -92,10 +86,10 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN-NEXT: %7:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0, implicit %7, implicit %8, implicit %9
+    ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]], implicit [[V_MUL_F16_e64_1]], implicit [[V_MUL_F16_e64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
     %2:vgpr(s32) = COPY $vgpr1
@@ -105,13 +99,10 @@ body: |
     %5:vgpr(s16) = G_TRUNC %1
     %6:vgpr(s16) = G_TRUNC %2
 
-    ; fmul vs
     %8:vgpr(s16) = G_FMUL %4, %4
 
-    ; fmul sv
     %9:vgpr(s16) = G_FMUL %4, %4
 
-    ; fmul vv
     %10:vgpr(s16) = G_FMUL %4, %5
 
     S_ENDPGM 0, implicit %8, implicit %9, implicit %10
@@ -131,26 +122,26 @@ body: |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %7:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %10:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %11:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
-    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_7:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_8:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F32_e64_9:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_5]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_6]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_7]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
     %2:vgpr(p1) = COPY $vgpr2_vgpr3
@@ -159,37 +150,27 @@ body: |
     %4:vgpr(s32) = G_FNEG %0
     %5:vgpr(s32) = G_FNEG %3
 
-    ; fabs lhs
     %6:vgpr(s32) = G_FMUL %3, %0
 
-    ; fabs rhs
     %7:vgpr(s32) = G_FMUL %0, %3
 
-    ; fabs lhs, rhs
     %8:vgpr(s32) = G_FMUL %3, %3
 
 
-    ; fneg lhs
     %9:vgpr(s32) = G_FMUL %4, %0
 
-    ; fneg rhs
     %10:vgpr(s32) = G_FMUL %0, %4
 
-    ; fneg  lhs, rhs
     %11:vgpr(s32) = G_FMUL %4, %4
 
 
-    ; fneg fabs lhs
     %12:vgpr(s32) = G_FMUL %5, %0
 
-    ; fneg fabs rhs
     %13:vgpr(s32) = G_FMUL %0, %5
 
-    ; fneg  fabs lhs, rhs
     %14:vgpr(s32) = G_FMUL %5, %5
 
 
-    ; fneg  fabs lhs, fneg rhs
     %15:vgpr(s32) = G_FMUL %5, %4
 
     G_STORE %6, %2 :: (store (s32), addrspace 1)
@@ -369,10 +350,8 @@ body: |
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 1, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4598175219545276416, implicit $exec
+    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 1, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
     ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index cda4414a0d90eea..47e5a2f35a56790 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -20,10 +20,8 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
     ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
@@ -62,7 +60,6 @@ body: |
   bb.1:
     liveins: $sgpr0_sgpr1
 
-    ; S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
     ; CHECK-LABEL: name: fract_f64_neg_abs
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
@@ -71,10 +68,8 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
     ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
index c9b1b782658c7f2..de21788ff168f71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
@@ -21,18 +21,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -62,18 +65,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>))
     ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_v2s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>))
     ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_v2s16_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s16>))
     ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_v2s16_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -103,18 +109,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3))
     ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_p3_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3))
     ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_p3_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst (p3))
     ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_p3_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -144,18 +153,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_s64_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_s64_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -185,18 +197,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_v2s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_v2s32_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_v2s32_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -226,18 +241,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_v4s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_v4s16_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<4 x s16>))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_v4s16_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -267,18 +285,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_p1_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_p1_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst (p1))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_p1_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -308,18 +329,21 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_p0_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_p0_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst (p0))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_p0_seq_cst
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -347,65 +371,60 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     %0:vgpr(p0) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -430,40 +449,39 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
index 2c2e792bce66dd1..b678966de85377b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
@@ -28,6 +28,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -39,18 +40,21 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -80,24 +84,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_v2s16_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s16_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_v2s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_v2s16_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -127,24 +135,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_p3_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_p3_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_p3_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst (p3), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_p3_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -179,6 +191,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -190,18 +203,21 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -231,24 +247,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_v2s32_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s32_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_v2s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_v2s32_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -278,24 +298,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_v4s16_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_v4s16_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_v4s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<4 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_v4s16_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -325,24 +349,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_p1_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_p1_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_p1_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst (p1), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_p1_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -372,24 +400,28 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX7-LABEL: name: load_atomic_global_p0_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_p0_seq_cst
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX9-LABEL: name: load_atomic_global_p0_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst (p0), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ;
     ; GFX10-LABEL: name: load_atomic_global_p0_seq_cst
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -417,66 +449,64 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -513,6 +543,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -524,43 +555,42 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load seq_cst (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4095
@@ -585,66 +615,64 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s64), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, implicit $exec :: (load seq_cst (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index 8f17cc3ab47ec18..a53fd81f351a2d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -1189,11 +1189,11 @@ body: |
     ; GFX6: liveins: $sgpr0_sgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1204,11 +1204,11 @@ body: |
     ; GFX7: liveins: $sgpr0_sgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1219,11 +1219,11 @@ body: |
     ; GFX8: liveins: $sgpr0_sgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
     ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1259,51 +1259,45 @@ body: |
     ; GFX6: liveins: $sgpr0_sgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
-    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX6-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX6-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX6-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     ;
     ; GFX7-LABEL: name: load_constant_s32_from_4_gep_negative_524288
     ; GFX7: liveins: $sgpr0_sgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
-    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX7-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX7-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX7-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     ;
     ; GFX8-LABEL: name: load_constant_s32_from_4_gep_negative_524288
     ; GFX8: liveins: $sgpr0_sgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
-    ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX8-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE1]], 0, 0 :: (load (s32), addrspace 4)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX8-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX8-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     ;
     ; GFX10-LABEL: name: load_constant_s32_from_4_gep_negative_524288
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index 78812ca1991f91e..d7c325439886263 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -23,24 +23,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -70,24 +74,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_2
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_2
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_2
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -117,24 +125,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -164,24 +176,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v2s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -211,24 +227,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<3 x s32>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v3s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -258,24 +278,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v4s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -305,24 +329,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_flat_s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -352,24 +380,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v2s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -399,24 +431,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX8-LABEL: name: load_flat_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX9-LABEL: name: load_flat_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX10-LABEL: name: load_flat_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX11-LABEL: name: load_flat_v2p1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -446,24 +482,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ;
     ; GFX8-LABEL: name: load_flat_s96
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ;
     ; GFX9-LABEL: name: load_flat_s96
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ;
     ; GFX10-LABEL: name: load_flat_s96
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load (s96), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ;
     ; GFX11-LABEL: name: load_flat_s96
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -493,24 +533,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX8-LABEL: name: load_flat_s128
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX9-LABEL: name: load_flat_s128
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX10-LABEL: name: load_flat_s128
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX11-LABEL: name: load_flat_s128
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -540,24 +584,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_flat_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_flat_p3_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_flat_p3_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_flat_p3_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -587,24 +635,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_flat_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_flat_p1_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_flat_p1_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_flat_p1_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -634,24 +686,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX8-LABEL: name: load_flat_p999_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX9-LABEL: name: load_flat_p999_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX10-LABEL: name: load_flat_p999_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX11-LABEL: name: load_flat_p999_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -681,24 +737,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX8-LABEL: name: load_flat_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX9-LABEL: name: load_flat_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX10-LABEL: name: load_flat_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX11-LABEL: name: load_flat_v2p3
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -728,24 +788,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_flat_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_flat_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_flat_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_flat_v2s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -775,24 +839,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>))
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v4s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -822,24 +890,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ;
     ; GFX8-LABEL: name: load_flat_v6s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ;
     ; GFX9-LABEL: name: load_flat_v6s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ;
     ; GFX10-LABEL: name: load_flat_v6s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load (<6 x s16>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ;
     ; GFX11-LABEL: name: load_flat_v6s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -869,24 +941,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_flat_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_flat_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_flat_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_flat_v8s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -918,56 +994,54 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -997,56 +1071,54 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1076,81 +1148,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2047
@@ -1175,81 +1241,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -2048
@@ -1274,56 +1334,54 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1353,81 +1411,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
@@ -1452,81 +1504,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4095
@@ -1551,81 +1597,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -4096
@@ -1650,81 +1690,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
@@ -1749,81 +1783,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
@@ -1848,81 +1876,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
@@ -1947,81 +1969,75 @@ body: |
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_flat_s32_from_1_gep_m8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8))
     ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 7ffa65f922456db..0103bfc9d39c14e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -243,17 +243,15 @@ body: |
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095
@@ -310,17 +308,15 @@ body: |
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096
@@ -440,35 +436,31 @@ body: |
     ; GFX9: liveins: $sgpr0_sgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199
-    ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
@@ -476,17 +468,15 @@ body: |
     ; GFX11: liveins: $sgpr0_sgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199
-    ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4097
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     %0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -562,17 +552,15 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247
-    ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -2049
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
     ; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
     ; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
     ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index f26e23293dae798..27806edc9180888 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -1212,34 +1212,30 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047
@@ -1309,34 +1305,30 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048
@@ -1350,17 +1342,15 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048
@@ -1392,78 +1382,70 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047
@@ -1509,78 +1491,70 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048
@@ -1650,34 +1624,30 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095
@@ -1691,17 +1661,15 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095
@@ -1759,85 +1727,75 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 4096
@@ -1862,78 +1820,70 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095
@@ -1947,17 +1897,15 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095
@@ -1989,78 +1937,70 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096
@@ -2074,17 +2014,15 @@ body: |
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096
@@ -2142,85 +2080,75 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8191
@@ -2271,85 +2199,75 @@ body: |
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 8192
@@ -2374,129 +2292,115 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8191
@@ -2521,129 +2425,115 @@ body: |
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX6-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
     ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_CONSTANT i64 -8192
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index edace19c47b1619..ad53a2fd8112029 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -193,7 +193,7 @@ body: |
 
 # Test a load of an offset from a constant base address
 # GCN-LABEL: name: constant_address_positive{{$}}
-# GCN: %0:sreg_64 = S_MOV_B64 44
+# GCN: %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 44
 
 # VI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 64, 0 :: (dereferenceable invariant load (s32), addrspace 4)
 # SICI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0 :: (dereferenceable invariant load (s32), addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
index 1a20e55958742e1..3f020aaa0365c73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
@@ -314,8 +314,8 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B64_]], implicit-def dead $scc
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+    ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
     %0:sgpr(p0) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 0
@@ -441,7 +441,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -2
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -2
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -468,7 +468,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -4
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -495,7 +495,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -8
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -8
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -522,7 +522,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -16
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -16
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -549,9 +549,7 @@ body: |
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3758096384
-    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-    ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -536870912
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -776,9 +774,7 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967294, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -2, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0
@@ -805,9 +801,7 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0
@@ -834,9 +828,7 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -4, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0
@@ -863,9 +855,7 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967280, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -16, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0
@@ -892,9 +882,7 @@ body: |
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3758096384, implicit $exec
-    ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
-    ; CHECK-NEXT: %const:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; CHECK-NEXT: %const:vreg_64 = V_MOV_B64_PSEUDO -536870912, implicit $exec
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %const.sub0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index c7520de936aa847..fc6925ee5709c5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -22,24 +22,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX8-LABEL: name: store_flat_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX9-LABEL: name: store_flat_s32_to_4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX10-LABEL: name: store_flat_s32_to_4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX11-LABEL: name: store_flat_s32_to_4
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -68,24 +72,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
+    ;
     ; GFX8-LABEL: name: store_flat_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
+    ;
     ; GFX9-LABEL: name: store_flat_s32_to_2
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
+    ;
     ; GFX10-LABEL: name: store_flat_s32_to_2
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16))
+    ;
     ; GFX11-LABEL: name: store_flat_s32_to_2
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -114,24 +122,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
+    ;
     ; GFX8-LABEL: name: store_flat_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
+    ;
     ; GFX9-LABEL: name: store_flat_s32_to_1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
+    ;
     ; GFX10-LABEL: name: store_flat_s32_to_1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8))
+    ;
     ; GFX11-LABEL: name: store_flat_s32_to_1
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -161,24 +173,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
+    ;
     ; GFX8-LABEL: name: store_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
+    ;
     ; GFX9-LABEL: name: store_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
+    ;
     ; GFX10-LABEL: name: store_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
+    ;
     ; GFX11-LABEL: name: store_flat_s64
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -207,24 +223,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX7-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
+    ;
     ; GFX8-LABEL: name: store_flat_s96
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
+    ;
     ; GFX9-LABEL: name: store_flat_s96
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
+    ;
     ; GFX10-LABEL: name: store_flat_s96
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store (s96), align 16)
+    ;
     ; GFX11-LABEL: name: store_flat_s96
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: {{  $}}
@@ -253,24 +273,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
+    ;
     ; GFX8-LABEL: name: store_flat_s128
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
+    ;
     ; GFX9-LABEL: name: store_flat_s128
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
+    ;
     ; GFX10-LABEL: name: store_flat_s128
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128))
+    ;
     ; GFX11-LABEL: name: store_flat_s128
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -300,24 +324,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
+    ;
     ; GFX8-LABEL: name: store_flat_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
+    ;
     ; GFX9-LABEL: name: store_flat_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
+    ;
     ; GFX10-LABEL: name: store_flat_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>))
+    ;
     ; GFX11-LABEL: name: store_flat_v2s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -346,24 +374,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
     ; GFX7-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
+    ;
     ; GFX8-LABEL: name: store_flat_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
+    ;
     ; GFX9-LABEL: name: store_flat_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
+    ;
     ; GFX10-LABEL: name: store_flat_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<3 x s32>), align 16)
+    ;
     ; GFX11-LABEL: name: store_flat_v3s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: {{  $}}
@@ -392,24 +424,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
+    ;
     ; GFX8-LABEL: name: store_flat_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
+    ;
     ; GFX9-LABEL: name: store_flat_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
+    ;
     ; GFX10-LABEL: name: store_flat_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>))
+    ;
     ; GFX11-LABEL: name: store_flat_v4s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -439,24 +475,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
+    ;
     ; GFX8-LABEL: name: store_flat_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
+    ;
     ; GFX9-LABEL: name: store_flat_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
+    ;
     ; GFX10-LABEL: name: store_flat_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>))
+    ;
     ; GFX11-LABEL: name: store_flat_v2s16
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -486,24 +526,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
+    ;
     ; GFX8-LABEL: name: store_flat_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
+    ;
     ; GFX9-LABEL: name: store_flat_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
+    ;
     ; GFX10-LABEL: name: store_flat_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>))
+    ;
     ; GFX11-LABEL: name: store_flat_v4s16
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -533,24 +577,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX7-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
+    ;
     ; GFX8-LABEL: name: store_flat_v6s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX8-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
+    ;
     ; GFX9-LABEL: name: store_flat_v6s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX9-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
+    ;
     ; GFX10-LABEL: name: store_flat_v6s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX10-NEXT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store (<6 x s16>), align 16)
+    ;
     ; GFX11-LABEL: name: store_flat_v6s16
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX11-NEXT: {{  $}}
@@ -579,24 +627,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
+    ;
     ; GFX8-LABEL: name: store_flat_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
+    ;
     ; GFX9-LABEL: name: store_flat_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
+    ;
     ; GFX10-LABEL: name: store_flat_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
+    ;
     ; GFX11-LABEL: name: store_flat_v8s16
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -626,24 +678,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
+    ;
     ; GFX8-LABEL: name: store_flat_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
+    ;
     ; GFX9-LABEL: name: store_flat_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
+    ;
     ; GFX10-LABEL: name: store_flat_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>))
+    ;
     ; GFX11-LABEL: name: store_flat_v2s64
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -673,24 +729,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
+    ;
     ; GFX8-LABEL: name: store_flat_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
+    ;
     ; GFX9-LABEL: name: store_flat_p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
+    ;
     ; GFX10-LABEL: name: store_flat_p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1))
+    ;
     ; GFX11-LABEL: name: store_flat_p1
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -720,24 +780,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
+    ;
     ; GFX8-LABEL: name: store_flat_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
+    ;
     ; GFX9-LABEL: name: store_flat_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
+    ;
     ; GFX10-LABEL: name: store_flat_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>))
+    ;
     ; GFX11-LABEL: name: store_flat_v2p1
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX11-NEXT: {{  $}}
@@ -767,24 +831,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
+    ;
     ; GFX8-LABEL: name: store_flat_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
+    ;
     ; GFX9-LABEL: name: store_flat_p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
+    ;
     ; GFX10-LABEL: name: store_flat_p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3))
+    ;
     ; GFX11-LABEL: name: store_flat_p3
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -814,24 +882,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
+    ;
     ; GFX8-LABEL: name: store_flat_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
+    ;
     ; GFX9-LABEL: name: store_flat_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
+    ;
     ; GFX10-LABEL: name: store_flat_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>))
+    ;
     ; GFX11-LABEL: name: store_flat_v2p3
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -860,24 +932,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
+    ;
     ; GFX8-LABEL: name: store_atomic_flat_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
+    ;
     ; GFX9-LABEL: name: store_atomic_flat_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
+    ;
     ; GFX10-LABEL: name: store_atomic_flat_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX10-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32))
+    ;
     ; GFX11-LABEL: name: store_atomic_flat_s32
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
@@ -907,24 +983,28 @@ body: |
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
+    ;
     ; GFX8-LABEL: name: store_atomic_flat_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
+    ;
     ; GFX9-LABEL: name: store_atomic_flat_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
+    ;
     ; GFX10-LABEL: name: store_atomic_flat_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX10-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64))
+    ;
     ; GFX11-LABEL: name: store_atomic_flat_s64
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX11-NEXT: {{  $}}
@@ -953,55 +1033,53 @@ body: |
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX8-LABEL: name: store_flat_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX9-LABEL: name: store_flat_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX10-LABEL: name: store_flat_s32_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX10-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ;
     ; GFX11-LABEL: name: store_flat_s32_gep_2047
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index c56ba70b667d96d..95e01aa413a3747 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -1168,34 +1168,30 @@ body: |
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX7-FLAT-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
     ;
     ; GFX8-LABEL: name: store_global_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
-    ; GFX8-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec
     ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
     ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
     ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-    ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
-    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+    ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
     ;
     ; GFX9-LABEL: name: store_global_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 13f788520710513..ce27598f69b3f1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -1638,21 +1638,21 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
 define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
 ; GFX7-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x40200000
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s2, 0
-; GFX8-NEXT:    s_mov_b32 s3, 0x40200000
-; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x40200000
+; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -1662,22 +1662,18 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
 ;
 ; GFX10-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:    s_mov_b32 s3, 0x40200000
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x40200000
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 02d1a92c6937393..4ab963ed85ccada 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -25,11 +25,8 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s5
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -37,12 +34,8 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-LABEL: global_atomic_csub_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0x1000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -73,11 +66,8 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub_offset_nortn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s5
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -85,12 +75,8 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX11-LABEL: global_atomic_csub_offset_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0x1000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index aecfbe7aa226067..3ccedb0733d51d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -317,31 +317,31 @@ bb:
 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
 ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GCN-NEXT:    s_mov_b64 s[10:11], 1.0
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x34
+; GCN-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-NEXT:    s_mov_b64 s[6:7], 1.0
+; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
-; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
-; GCN-NEXT:    v_accvgpr_write_b32 a0, s4
-; GCN-NEXT:    v_accvgpr_write_b32 a1, s5
-; GCN-NEXT:    v_accvgpr_write_b32 a2, s6
-; GCN-NEXT:    v_accvgpr_write_b32 a3, s7
-; GCN-NEXT:    v_accvgpr_write_b32 a4, s8
-; GCN-NEXT:    v_accvgpr_write_b32 a5, s9
-; GCN-NEXT:    v_accvgpr_write_b32 a6, s10
-; GCN-NEXT:    v_accvgpr_write_b32 a7, s11
-; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
-; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 ; GCN-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index 0ec4f64b38a1ba1..a4d5fe4ffa5a758 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -12,11 +12,12 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
 ; LOOP-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
 ; LOOP-NEXT:    s_cbranch_execz .LBB0_3
 ; LOOP-NEXT:  ; %bb.1: ; %copy_forward
-; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_mov_b64 s[6:7], 0
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
-; LOOP-NEXT:    v_mov_b32_e32 v5, s1
-; LOOP-NEXT:    v_mov_b32_e32 v4, s0
+; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    v_mov_b32_e32 v4, s6
+; LOOP-NEXT:    v_mov_b32_e32 v5, s7
 ; LOOP-NEXT:  .LBB0_2: ; %copy_forward_loop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 7cd3babc7090933..3edd2e0914a6eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -7,11 +7,12 @@ declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
 define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
 ; LOOP-LABEL: memset_p1i8:
 ; LOOP:       ; %bb.0: ; %loadstoreloop.preheader
-; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_mov_b64 s[4:5], 0
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
-; LOOP-NEXT:    v_mov_b32_e32 v4, s1
-; LOOP-NEXT:    v_mov_b32_e32 v3, s0
+; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    v_mov_b32_e32 v3, s4
+; LOOP-NEXT:    v_mov_b32_e32 v4, s5
 ; LOOP-NEXT:  .LBB0_1: ; %loadstoreloop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; LOOP-NEXT:    v_add_i32_e32 v5, vcc, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4248f7b6a158312..9dacdbc46be1948 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2259,12 +2259,13 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
-; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_lshl_b64 v[5:6], v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2717,161 +2718,163 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v5, v2
-; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
+; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    v_lshl_b64 v[11:12], s[4:5], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v12
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v1, v3, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
-; CGP-NEXT:    v_subb_u32_e32 v14, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; CGP-NEXT:    v_trunc_f32_e32 v10, v4
-; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v10
-; CGP-NEXT:    v_cvt_u32_f32_e32 v12, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v15, v10
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v4, v15, v3
-; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
-; CGP-NEXT:    v_mul_hi_u32 v11, v12, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v17, v15, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v12, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v16, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v17, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v15, v10
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v3
-; CGP-NEXT:    v_addc_u32_e32 v15, vcc, v15, v4, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v13
-; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v13, vcc
-; CGP-NEXT:    v_xor_b32_e32 v11, v4, v13
-; CGP-NEXT:    v_mul_lo_u32 v4, v15, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v12, v10
-; CGP-NEXT:    v_xor_b32_e32 v14, v8, v13
-; CGP-NEXT:    v_mul_hi_u32 v8, v12, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v15, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v15, v10
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v12, v3
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v14, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v11, v4
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v14, v3
-; CGP-NEXT:    v_mul_hi_u32 v12, v14, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v12
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v0
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v12, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v4, v1, v0
+; CGP-NEXT:    v_xor_b32_e32 v1, v10, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v11, v1
+; CGP-NEXT:    v_sub_i32_e32 v14, vcc, 0, v4
+; CGP-NEXT:    v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT:    v_mac_f32_e32 v10, 0x4f800000, v11
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; CGP-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v10
+; CGP-NEXT:    v_trunc_f32_e32 v12, v11
+; CGP-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v12
+; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v10
+; CGP-NEXT:    v_cvt_u32_f32_e32 v16, v12
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
+; CGP-NEXT:    v_mul_hi_u32 v17, v13, v10
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; CGP-NEXT:    v_mul_lo_u32 v12, v16, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT:    v_mul_lo_u32 v18, v13, v11
+; CGP-NEXT:    v_mul_lo_u32 v19, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; CGP-NEXT:    v_mul_hi_u32 v17, v13, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v18, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v19, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; CGP-NEXT:    v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v10
+; CGP-NEXT:    v_addc_u32_e32 v16, vcc, v16, v11, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
+; CGP-NEXT:    v_ashrrev_i32_e32 v14, 31, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v8, v14
+; CGP-NEXT:    v_mul_lo_u32 v8, v16, v10
+; CGP-NEXT:    v_mul_lo_u32 v15, v13, v11
+; CGP-NEXT:    v_xor_b32_e32 v17, v9, v14
+; CGP-NEXT:    v_mul_hi_u32 v9, v13, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v14, v4
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_hi_u32 v9, v11, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT:    v_mul_hi_u32 v15, v13, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
+; CGP-NEXT:    v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v3, v8
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v8
-; CGP-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v2, v12, v[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v10, v[8:9]
-; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v14, v8, vcc
-; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v14, v8
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
-; CGP-NEXT:    v_subb_u32_e32 v8, vcc, v8, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v9, v11, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v10
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v14, v2, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v9
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v11, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v16, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v17, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v12, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v17, v8
+; CGP-NEXT:    v_mul_hi_u32 v15, v17, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v17, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v12, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v8, v10
+; CGP-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v10
+; CGP-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10]
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10]
+; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v17, v9, vcc
+; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v17, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
+; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v1
+; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v10, v12, v15, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v1
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v16, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v12
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v15, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v1, v3
-; CGP-NEXT:    v_xor_b32_e32 v1, v2, v3
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v15, v8, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v8, v14, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v1, v8
+; CGP-NEXT:    v_xor_b32_e32 v1, v4, v8
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr11_vgpr12
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[8:9]
-; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
-; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    v_lshl_b64 v[9:10], v[2:3], v6
+; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v11
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v11
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -2879,19 +2882,19 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT:    v_mul_lo_u32 v1, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, v11
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
-; CGP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d0c55c69f508775..d1599ac489a5f51 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -2219,12 +2219,13 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_srem_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
-; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_lshl_b64 v[5:6], v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2671,159 +2672,164 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v5, v2
-; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
+; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    v_lshl_b64 v[11:12], s[4:5], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v12
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v2, v1
-; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v3, v1, vcc
+; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v12
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v11, v1
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v12, v1, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v1
-; CGP-NEXT:    v_xor_b32_e32 v1, v2, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v0
-; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; CGP-NEXT:    v_trunc_f32_e32 v4, v3
-; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; CGP-NEXT:    v_xor_b32_e32 v1, v4, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v1
+; CGP-NEXT:    v_sub_i32_e32 v14, vcc, 0, v0
+; CGP-NEXT:    v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v12, v10
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v12
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v4
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4]
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v2
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4]
-; CGP-NEXT:    v_mul_lo_u32 v4, v13, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v3
-; CGP-NEXT:    v_mul_lo_u32 v16, v13, v3
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v16, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v2
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v13, v3, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4]
-; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v2
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v4, v11
-; CGP-NEXT:    v_mul_lo_u32 v4, v13, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v3
-; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v11
+; CGP-NEXT:    v_cvt_u32_f32_e32 v16, v12
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT:    v_mov_b32_e32 v4, v11
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v10
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; CGP-NEXT:    v_mul_hi_u32 v12, v13, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT:    v_mul_lo_u32 v17, v13, v11
+; CGP-NEXT:    v_mul_lo_u32 v18, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_mul_hi_u32 v12, v13, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, v10, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v14, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v17, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v18, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v13, v3, vcc
-; CGP-NEXT:    v_mul_lo_u32 v4, v8, v2
-; CGP-NEXT:    v_mul_lo_u32 v10, v9, v3
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v3
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
+; CGP-NEXT:    v_mul_hi_u32 v11, v16, v11
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT:    v_mul_hi_u32 v10, v9, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v2, v4
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v4
+; CGP-NEXT:    v_addc_u32_e32 v16, vcc, v16, v10, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT:    v_mov_b32_e32 v4, v11
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; CGP-NEXT:    v_ashrrev_i32_e32 v14, 31, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v14
+; CGP-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v14, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v4, v14
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v10
+; CGP-NEXT:    v_mul_lo_u32 v9, v13, v11
+; CGP-NEXT:    v_xor_b32_e32 v15, v8, v14
+; CGP-NEXT:    v_mul_hi_u32 v8, v13, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v9, v13, v11
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v10, v16, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; CGP-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v1, v12, v[3:4]
-; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v8, v3, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v8, v3
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v0
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v0
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v16, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v15, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v12, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v12, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v15, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v15, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v10, v12, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v4, v9
+; CGP-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v4
+; CGP-NEXT:    v_mov_b32_e32 v4, v9
+; CGP-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; CGP-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10]
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v15, v9, vcc
+; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v15, v9
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v1
+; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v4, v0
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v9, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v1
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v9, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v1
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v9, v1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v11, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v11
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v11
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v11, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v12, v1, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v14
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v14
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr11_vgpr12
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
-; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
+; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    v_lshl_b64 v[9:10], v[2:3], v6
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v11
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v11
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -2831,13 +2837,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT:    v_mul_lo_u32 v0, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v0, v0, v11
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v11
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v11
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 77737b356ff6e9c..b3b57e14cb3fb55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -1070,11 +1070,12 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
-; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:    v_lshl_b64 v[5:6], v[0:1], v2
+; CHECK-NEXT:    v_or_b32_e32 v8, v4, v6
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1509,20 +1510,22 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
+; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v10, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v11, 0
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
-; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v4
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1531,105 +1534,105 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v1, v0
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v12
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v0, v11
-; CGP-NEXT:    v_mul_lo_u32 v16, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v11
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v0
+; CGP-NEXT:    v_mul_hi_u32 v16, v1, v0
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v14
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_mul_lo_u32 v16, v0, v13
+; CGP-NEXT:    v_mul_lo_u32 v18, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v19, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v17
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v13
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; CGP-NEXT:    v_mul_lo_u32 v12, v0, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v4, v1
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v4, v1
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v0
-; CGP-NEXT:    v_mul_hi_u32 v12, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v0, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v10, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v15, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v15, v8, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v0
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v13, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v4, v2, v1
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v0
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v9, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, 1, v0
+; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v15
+; CGP-NEXT:    v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT:    v_subb_u32_e64 v12, s[4:5], v9, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v9, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
@@ -1639,8 +1642,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v11, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v14, v15, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v15, v13, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v16, v17, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1648,9 +1651,9 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[8:9]
-; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
-; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
+; CGP-NEXT:    v_lshl_b64 v[9:10], v[10:11], v6
+; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v4
@@ -1673,7 +1676,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
-; CGP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 097f6642cbc669b..ecf7cc921886c08 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1576,11 +1576,12 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
-; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:    v_lshl_b64 v[5:6], v[0:1], v2
+; CHECK-NEXT:    v_or_b32_e32 v8, v4, v6
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2010,20 +2011,22 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
+; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v10, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v11, 0
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
-; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v4
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2032,92 +2035,92 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v1, v0
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v12
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v0, v11
-; CGP-NEXT:    v_mul_lo_u32 v16, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v11
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v0
+; CGP-NEXT:    v_mul_hi_u32 v16, v1, v0
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v14
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_mul_lo_u32 v16, v0, v13
+; CGP-NEXT:    v_mul_lo_u32 v18, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v19, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v17
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v1, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v13
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; CGP-NEXT:    v_mul_lo_u32 v12, v0, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v4, v1
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v4, v1
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v0
-; CGP-NEXT:    v_mul_hi_u32 v12, v1, v0
-; CGP-NEXT:    v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v0, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v10, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v15, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v15, v8, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v0
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v13, v3, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v10
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v12
 ; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v9, v0, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v9, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v2
@@ -2128,19 +2131,19 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v1, v2
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v0, vcc
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v11, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v14, v13, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
@@ -2148,8 +2151,8 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow1
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
-; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
+; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    v_lshl_b64 v[9:10], v[10:11], v6
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 8c4483fc118dbb8..bf5843ea8047d5f 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -691,12 +691,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s9
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s9
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v3
 ; GFX90A-NEXT:    s_mul_i32 s4, s4, s10
 ; GFX90A-NEXT:    s_mul_hi_u32 s4, s10, s4
 ; GFX90A-NEXT:    s_add_i32 s10, s10, s4
@@ -713,7 +713,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_cselect_b32 s4, s10, s4
 ; GFX90A-NEXT:    s_lshr_b32 s9, s9, 16
 ; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[4:5], 5
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, s9
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s9
 ; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
 ; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[6:7], 5
 ; GFX90A-NEXT:    s_or_b32 s10, s10, 28
@@ -737,7 +737,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_10
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
 ; GFX90A-NEXT:    s_mov_b32 s9, s8
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
@@ -795,7 +795,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v22, v21
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[0:1], v[14:15]
+; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[2:3], v[14:15]
 ; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0]
 ; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 7eb1cb926c190de..6d7fda124df2f5e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7972,8 +7972,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
+; GFX6-NEXT:    s_lshl_b64 s[4:5], 0x1000, s8
 ; GFX6-NEXT:    s_add_u32 s4, s4, -1
 ; GFX6-NEXT:    s_addc_u32 s5, s5, -1
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
@@ -7986,10 +7985,9 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s2
 ; GFX9-NEXT:    s_add_u32 s0, s0, -1
 ; GFX9-NEXT:    s_addc_u32 s1, s1, -1
 ; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
@@ -8064,46 +8062,44 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[12:13], s6
-; GFX6-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
-; GFX6-NEXT:    s_add_u32 s4, s4, -1
-; GFX6-NEXT:    s_addc_u32 s5, s5, -1
-; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT:    s_add_u32 s4, s6, -1
-; GFX6-NEXT:    s_addc_u32 s5, s7, -1
-; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_mov_b32_e32 v3, s3
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], 0x1000, s10
+; GFX6-NEXT:    s_lshl_b64 s[8:9], 0x1000, s8
+; GFX6-NEXT:    s_add_u32 s8, s8, -1
+; GFX6-NEXT:    s_addc_u32 s9, s9, -1
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_add_u32 s8, s10, -1
+; GFX6-NEXT:    s_addc_u32 s9, s11, -1
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s7
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 0x1000, s10
+; GFX9-NEXT:    s_lshl_b64 s[8:9], 0x1000, s8
+; GFX9-NEXT:    s_add_u32 s8, s8, -1
+; GFX9-NEXT:    s_addc_u32 s9, s9, -1
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9-NEXT:    s_add_u32 s2, s2, -1
 ; GFX9-NEXT:    s_addc_u32 s3, s3, -1
-; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT:    s_add_u32 s4, s10, -1
-; GFX9-NEXT:    s_addc_u32 s5, s11, -1
-; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
@@ -8364,12 +8360,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_lshl_b64 s[2:3], 0x1000, s2
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX6-NEXT:    s_add_u32 s2, s2, s8
 ; GFX6-NEXT:    s_mov_b32 s9, s8
@@ -8401,17 +8396,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -8419,6 +8413,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
@@ -8501,10 +8496,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
+; GFX9-NEXT:    s_lshl_b64 s[4:5], 0x1000, s2
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
 ; GFX9-NEXT:    s_add_u32 s4, s4, s2
 ; GFX9-NEXT:    s_mov_b32 s3, s2
@@ -8530,12 +8524,12 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s11
 ; GFX9-NEXT:    s_mul_i32 s13, s1, s11
 ; GFX9-NEXT:    s_add_i32 s12, s14, s12
-; GFX9-NEXT:    s_add_i32 s12, s12, s13
 ; GFX9-NEXT:    s_mul_i32 s15, s0, s11
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s15
 ; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s12
-; GFX9-NEXT:    s_mul_i32 s14, s11, s12
-; GFX9-NEXT:    s_mul_hi_u32 s11, s11, s15
-; GFX9-NEXT:    s_add_u32 s11, s11, s14
+; GFX9-NEXT:    s_mul_i32 s11, s11, s12
+; GFX9-NEXT:    s_add_u32 s11, s14, s11
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s13
 ; GFX9-NEXT:    s_mul_hi_u32 s16, s10, s15
 ; GFX9-NEXT:    s_mul_i32 s15, s10, s15
@@ -8954,10 +8948,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s10
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_lshl_b64 s[2:3], 0x1000, s8
+; GFX6-NEXT:    s_lshl_b64 s[12:13], 0x1000, s10
 ; GFX6-NEXT:    s_ashr_i32 s14, s3, 31
 ; GFX6-NEXT:    s_add_u32 s2, s2, s14
 ; GFX6-NEXT:    s_mov_b32 s15, s14
@@ -8990,15 +8983,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -9212,11 +9205,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 0x1000, s8
+; GFX9-NEXT:    s_lshl_b64 s[10:11], 0x1000, s10
 ; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s8
 ; GFX9-NEXT:    s_mov_b32 s9, s8
@@ -9241,8 +9233,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s15
 ; GFX9-NEXT:    s_mul_i32 s17, s1, s15
 ; GFX9-NEXT:    s_add_i32 s16, s18, s16
-; GFX9-NEXT:    s_add_i32 s16, s16, s17
 ; GFX9-NEXT:    s_mul_i32 s19, s0, s15
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
 ; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
 ; GFX9-NEXT:    s_mul_i32 s18, s15, s16
 ; GFX9-NEXT:    s_mul_hi_u32 s15, s15, s19
@@ -9764,12 +9756,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_lshl_b64 s[2:3], 0x1000, s2
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_add_u32 s2, s2, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
@@ -9801,17 +9792,16 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
+; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -9819,6 +9809,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
@@ -9899,10 +9890,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX9-LABEL: srem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 0x1000, s2
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s4
 ; GFX9-NEXT:    s_mov_b32 s5, s4
@@ -9928,12 +9918,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s3
 ; GFX9-NEXT:    s_mul_i32 s11, s1, s3
 ; GFX9-NEXT:    s_add_i32 s10, s12, s10
-; GFX9-NEXT:    s_add_i32 s10, s10, s11
 ; GFX9-NEXT:    s_mul_i32 s13, s0, s3
+; GFX9-NEXT:    s_add_i32 s10, s10, s11
+; GFX9-NEXT:    s_mul_hi_u32 s12, s3, s13
 ; GFX9-NEXT:    s_mul_hi_u32 s11, s3, s10
-; GFX9-NEXT:    s_mul_i32 s12, s3, s10
-; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s13
-; GFX9-NEXT:    s_add_u32 s3, s3, s12
+; GFX9-NEXT:    s_mul_i32 s3, s3, s10
+; GFX9-NEXT:    s_add_u32 s3, s12, s3
 ; GFX9-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX9-NEXT:    s_mul_hi_u32 s14, s2, s13
 ; GFX9-NEXT:    s_mul_i32 s13, s2, s13
@@ -10138,11 +10128,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT:    s_lshl_b64 s[2:3], 0x1000, s8
+; GFX6-NEXT:    s_lshl_b64 s[16:17], 0x1000, s10
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX6-NEXT:    s_add_u32 s2, s2, s8
 ; GFX6-NEXT:    s_mov_b32 s9, s8
@@ -10176,15 +10165,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -10392,11 +10381,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 0x1000, s8
+; GFX9-NEXT:    s_lshl_b64 s[10:11], 0x1000, s10
 ; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s8
 ; GFX9-NEXT:    s_mov_b32 s9, s8
@@ -10421,8 +10409,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_hi_u32 s16, s0, s3
 ; GFX9-NEXT:    s_mul_i32 s15, s1, s3
 ; GFX9-NEXT:    s_add_i32 s14, s16, s14
-; GFX9-NEXT:    s_add_i32 s14, s14, s15
 ; GFX9-NEXT:    s_mul_i32 s17, s0, s3
+; GFX9-NEXT:    s_add_i32 s14, s14, s15
 ; GFX9-NEXT:    s_mul_hi_u32 s15, s3, s14
 ; GFX9-NEXT:    s_mul_i32 s16, s3, s14
 ; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s17
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index abd9a4159f8ccd9..60c313444521822 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -4416,9 +4416,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -4452,8 +4452,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 5
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -4966,9 +4966,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -5002,8 +5002,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 5
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -5516,9 +5516,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -5551,8 +5551,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 5
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
@@ -6061,9 +6061,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
@@ -6096,8 +6096,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cbranch_execz .LBB24_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 5
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 6efbd6ce87385ee..8082a0646d4a139 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -2533,8 +2533,7 @@ define i1 @test124(i32 %arg1, i64 %arg2) {
 ; GCN-LABEL: test124:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[0:1], 0x3e8
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[1:2]
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2]
 ; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index c703a1dd7734dc9..6997913f1ae1617 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -250,8 +250,8 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa
 ; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
 
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
-; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u64_e32 vcc, s[[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41
+; GCN: v_cmp_gt_u64_e32 vcc, [[K]], v{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index bd0c2b30eb5deab..53c9861b7a0516b 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -310,8 +310,7 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
 ;
 ; GISEL-LABEL: s_csh_64_1:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0xff
-; GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], 0xff
 ; GISEL-NEXT:    s_lshl_b64 s[4:5], s[0:1], s2
 ; GISEL-NEXT:    s_lshr_b64 s[6:7], s[0:1], s2
 ; GISEL-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 86d0df494bcacd7..d9e0ddd3b904486 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -27,29 +27,27 @@ define float @v_mul_42_f32(float %x) {
 }
 
 define double @v_mul_42_f64(double %x) {
-; GFX9-LABEL: v_mul_42_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40450000
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_mul_42_f64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40450000
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_mul_42_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40450000
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_mul_42_f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40450000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_mul_42_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40450000
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_42_f64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_mul_f64 v[0:1], 0x40450000, v[0:1]
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 42.0
   ret double %mul
 }
@@ -726,9 +724,9 @@ define double @v_mul_0x1pn1031_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s5, 0x800
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1031_f64:
@@ -740,9 +738,7 @@ define double @v_mul_0x1pn1031_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_movk_i32 s5, 0x800
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1031_f64:
@@ -754,9 +750,7 @@ define double @v_mul_0x1pn1031_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_movk_i32 s1, 0x800
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.34584737989687770135e-311
   ret double %mul
@@ -774,9 +768,9 @@ define double @v_mul_0x1pn1022_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x100000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x100000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1022_f64:
@@ -788,9 +782,7 @@ define double @v_mul_0x1pn1022_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x100000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x100000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1022_f64:
@@ -802,9 +794,7 @@ define double @v_mul_0x1pn1022_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x100000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x100000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 2.22507385850720138309e-308
   ret double %mul
@@ -822,9 +812,9 @@ define double @v_mul_0x1pn1021_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x200000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x200000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1021_f64:
@@ -836,9 +826,7 @@ define double @v_mul_0x1pn1021_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x200000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x200000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1021_f64:
@@ -850,9 +838,7 @@ define double @v_mul_0x1pn1021_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x200000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x200000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.45014771701440276618e-308
   ret double %mul
@@ -870,9 +856,9 @@ define double @v_mul_0x1pn64_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x3bf00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3bf00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn64_f64:
@@ -884,9 +870,7 @@ define double @v_mul_0x1pn64_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x3bf00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3bf00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn64_f64:
@@ -898,9 +882,7 @@ define double @v_mul_0x1pn64_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x3bf00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3bf00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 5.42101086242752217004e-20
   ret double %mul
@@ -918,9 +900,9 @@ define double @v_mul_0x1pn17_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x3ee00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3ee00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn17_f64:
@@ -932,9 +914,7 @@ define double @v_mul_0x1pn17_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x3ee00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ee00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn17_f64:
@@ -946,9 +926,7 @@ define double @v_mul_0x1pn17_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x3ee00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ee00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.00000762939453125
   ret double %mul
@@ -965,9 +943,9 @@ define double @v_mul_0x1pn16_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x3ef00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3ef00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn16_f64:
@@ -979,9 +957,7 @@ define double @v_mul_0x1pn16_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x3ef00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ef00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn16_f64:
@@ -993,9 +969,7 @@ define double @v_mul_0x1pn16_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x3ef00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ef00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.0000152587890625
   ret double %mul
@@ -1012,9 +986,9 @@ define double @v_mul_0x1pn15_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0.5
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0.5
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn15_f64:
@@ -1026,9 +1000,7 @@ define double @v_mul_0x1pn15_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0.5
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3f000000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn15_f64:
@@ -1040,9 +1012,7 @@ define double @v_mul_0x1pn15_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0.5
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3f000000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.000030517578125
   ret double %mul
@@ -1058,9 +1028,9 @@ define double @v_mul_neg256_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0700000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0700000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg256_f64:
@@ -1072,9 +1042,7 @@ define double @v_mul_neg256_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0700000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg256_f64:
@@ -1086,9 +1054,7 @@ define double @v_mul_neg256_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0700000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -256.0
   ret double %mul
@@ -1104,9 +1070,9 @@ define double @v_mul_neg128_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0600000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0600000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg128_f64:
@@ -1118,9 +1084,7 @@ define double @v_mul_neg128_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0600000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0600000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg128_f64:
@@ -1132,9 +1096,7 @@ define double @v_mul_neg128_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0600000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0600000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -128.0
   ret double %mul
@@ -1150,9 +1112,9 @@ define double @v_mul_neg64_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0500000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0500000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg64_f64:
@@ -1164,9 +1126,7 @@ define double @v_mul_neg64_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0500000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0500000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg64_f64:
@@ -1178,9 +1138,7 @@ define double @v_mul_neg64_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0500000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0500000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -64.0
   ret double %mul
@@ -1196,9 +1154,9 @@ define double @v_mul_neg32_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0400000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg32_f64:
@@ -1210,9 +1168,7 @@ define double @v_mul_neg32_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0400000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg32_f64:
@@ -1224,9 +1180,7 @@ define double @v_mul_neg32_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0400000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -32.0
   ret double %mul
@@ -1242,9 +1196,9 @@ define double @v_mul_neg16_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0300000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0300000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg16_f64:
@@ -1256,9 +1210,7 @@ define double @v_mul_neg16_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0300000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg16_f64:
@@ -1270,9 +1222,7 @@ define double @v_mul_neg16_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0300000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -16.0
   ret double %mul
@@ -1288,9 +1238,9 @@ define double @v_mul_neg8_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0200000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0200000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg8_f64:
@@ -1302,9 +1252,7 @@ define double @v_mul_neg8_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0200000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg8_f64:
@@ -1316,9 +1264,7 @@ define double @v_mul_neg8_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0200000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -8.0
   ret double %mul
@@ -1414,9 +1360,9 @@ define double @v_mul_neg_quarter_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xbfd00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbfd00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg_quarter_f64:
@@ -1428,9 +1374,7 @@ define double @v_mul_neg_quarter_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xbfd00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg_quarter_f64:
@@ -1442,9 +1386,7 @@ define double @v_mul_neg_quarter_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xbfd00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -0.25
   ret double %mul
@@ -1460,9 +1402,9 @@ define double @v_mul_quarter_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x3fd00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fd00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_quarter_f64:
@@ -1474,9 +1416,7 @@ define double @v_mul_quarter_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x3fd00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_quarter_f64:
@@ -1488,9 +1428,7 @@ define double @v_mul_quarter_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x3fd00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.25
   ret double %mul
@@ -1575,9 +1513,9 @@ define double @v_mul_8_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_8_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40200000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_8_f64:
@@ -1589,9 +1527,7 @@ define double @v_mul_8_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_8_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_8_f64:
@@ -1603,9 +1539,7 @@ define double @v_mul_8_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_8_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40200000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 8.0
   ret double %mul
@@ -1621,9 +1555,9 @@ define double @v_mul_16_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_16_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40300000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_16_f64:
@@ -1635,9 +1569,7 @@ define double @v_mul_16_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_16_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_16_f64:
@@ -1649,9 +1581,7 @@ define double @v_mul_16_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_16_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40300000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 16.0
   ret double %mul
@@ -1667,9 +1597,9 @@ define double @v_mul_32_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_32_f64:
@@ -1681,9 +1611,7 @@ define double @v_mul_32_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_32_f64:
@@ -1695,9 +1623,7 @@ define double @v_mul_32_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 32.0
   ret double %mul
@@ -1713,9 +1639,9 @@ define double @v_mul_64_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_64_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40500000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40500000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_64_f64:
@@ -1727,9 +1653,7 @@ define double @v_mul_64_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_64_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40500000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40500000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_64_f64:
@@ -1741,9 +1665,7 @@ define double @v_mul_64_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_64_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40500000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40500000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 64.0
   ret double %mul
@@ -1759,9 +1681,9 @@ define double @v_mul_128_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_128_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40600000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40600000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_128_f64:
@@ -1773,9 +1695,7 @@ define double @v_mul_128_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_128_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40600000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40600000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_128_f64:
@@ -1787,9 +1707,7 @@ define double @v_mul_128_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_128_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40600000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40600000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 128.0
   ret double %mul
@@ -1805,9 +1723,9 @@ define double @v_mul_256_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_256_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40700000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_256_f64:
@@ -1819,9 +1737,7 @@ define double @v_mul_256_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_256_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40700000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_256_f64:
@@ -1833,9 +1749,7 @@ define double @v_mul_256_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_256_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40700000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40700000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 256.0
   ret double %mul
@@ -1852,9 +1766,9 @@ define double @v_mul_0x1p63_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x43e00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43e00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p63_f64:
@@ -1866,9 +1780,7 @@ define double @v_mul_0x1p63_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x43e00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43e00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p63_f64:
@@ -1880,9 +1792,7 @@ define double @v_mul_0x1p63_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x43e00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43e00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 9223372036854775808.0
   ret double %mul
@@ -1899,9 +1809,9 @@ define double @v_mul_0x1p64_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x43f00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43f00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p64_f64:
@@ -1913,9 +1823,7 @@ define double @v_mul_0x1p64_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x43f00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43f00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p64_f64:
@@ -1927,9 +1835,7 @@ define double @v_mul_0x1p64_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x43f00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43f00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 18446744073709551616.0
   ret double %mul
@@ -1947,9 +1853,9 @@ define double @v_mul_0x1p65_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_brev_b32 s5, 34
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 34
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p65_f64:
@@ -1961,9 +1867,7 @@ define double @v_mul_0x1p65_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_brev_b32 s5, 34
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p65_f64:
@@ -1975,9 +1879,7 @@ define double @v_mul_0x1p65_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_brev_b32 s1, 34
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 36893488147419103232.0
   ret double %mul
@@ -1994,10 +1896,8 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX9-GISEL-LABEL: s_mul_0x1p65_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX9-GISEL-NEXT:    s_brev_b32 s3, 34
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v1, 34
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
 ; GFX9-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
@@ -2012,9 +1912,7 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX10-GISEL-LABEL: s_mul_0x1p65_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX10-GISEL-NEXT:    s_brev_b32 s3, 34
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, s[0:1]
 ; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    ; return to shader part epilog
@@ -2028,9 +1926,7 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX11-GISEL-LABEL: s_mul_0x1p65_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX11-GISEL-NEXT:    s_brev_b32 s3, 34
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, s[0:1]
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-GISEL-NEXT:    ; return to shader part epilog
@@ -2057,9 +1953,9 @@ define double @v_mul_0x1p128_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x47f00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x47f00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p128_f64:
@@ -2071,9 +1967,7 @@ define double @v_mul_0x1p128_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x47f00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x47f00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p128_f64:
@@ -2085,9 +1979,7 @@ define double @v_mul_0x1p128_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x47f00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x47f00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 3.40282366920938463463e+38
   ret double %mul
@@ -2105,9 +1997,9 @@ define double @v_mul_0x1p1022_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x7fd00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fd00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p1022_f64:
@@ -2119,9 +2011,7 @@ define double @v_mul_0x1p1022_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x7fd00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fd00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p1022_f64:
@@ -2133,9 +2023,7 @@ define double @v_mul_0x1p1022_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x7fd00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fd00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.49423283715578976932e+307
   ret double %mul
@@ -2153,9 +2041,9 @@ define double @v_mul_0x1p1023_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x7fe00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fe00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p1023_f64:
@@ -2167,9 +2055,7 @@ define double @v_mul_0x1p1023_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x7fe00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fe00000, v[0:1]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p1023_f64:
@@ -2181,9 +2067,7 @@ define double @v_mul_0x1p1023_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x7fe00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fe00000, v[0:1]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 8.98846567431157953865e+307
   ret double %mul
@@ -2191,29 +2075,27 @@ define double @v_mul_0x1p1023_f64(double %x) {
 
 ; Check that this doesn't interfer with fma formation
 define double @v_fma_mul_add_32_f64(double %x, double %y) {
-; GFX9-LABEL: v_fma_mul_add_32_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_fma_mul_add_32_f64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fma_mul_add_32_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_fma_mul_add_32_f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fma_mul_add_32_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], s[0:1], v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_fma_mul_add_32_f64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[2:3]
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 32.0
   %fma = fadd contract double %mul, %y
   ret double %fma
@@ -2229,23 +2111,33 @@ define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fma_mul_add_32_v2f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fma_mul_add_32_v2f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], s[0:1], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], s[0:1], v[6:7]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
+; GFX11-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x double> %x, <double 32.0, double 32.0>
   %fma = fadd contract <2 x double> %mul, %y
   ret <2 x double> %fma
@@ -2319,9 +2211,9 @@ define double @v_mul_add_32_f64(double %x, double %y) {
 ; GFX9-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2335,9 +2227,7 @@ define double @v_mul_add_32_f64(double %x, double %y) {
 ; GFX10-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2351,9 +2241,7 @@ define double @v_mul_add_32_f64(double %x, double %y) {
 ; GFX11-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 32.0
@@ -2456,58 +2344,54 @@ define double @v_mul_add_4_f64(double %x, double %y) {
 }
 
 define double @v_fma_mul_sub_32_f64(double %x, double %y) {
-; GFX9-LABEL: v_fma_mul_sub_32_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_fma_mul_sub_32_f64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fma_mul_sub_32_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_fma_mul_sub_32_f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], -v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fma_mul_sub_32_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], s[0:1], -v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_fma_mul_sub_32_f64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], -v[2:3]
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 32.0
   %fma = fsub contract double %mul, %y
   ret double %fma
 }
 
 define double @v_fma_mul_add_neg32_f64(double %x, double %y) {
-; GFX9-LABEL: v_fma_mul_add_neg32_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc0400000
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_fma_mul_add_neg32_f64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc0400000
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fma_mul_add_neg32_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0xc0400000
-; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_fma_mul_add_neg32_f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0400000
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fma_mul_add_neg32_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0xc0400000
-; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], s[0:1], v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_fma_mul_add_neg32_f64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_fma_f64 v[0:1], 0xc0400000, v[0:1], v[2:3]
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, -32.0
   %fma = fadd contract double %mul, %y
   ret double %fma
@@ -2523,9 +2407,9 @@ define double @v_mul_fabs_32_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_32_f64:
@@ -2537,9 +2421,7 @@ define double @v_mul_fabs_32_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_32_f64:
@@ -2551,9 +2433,7 @@ define double @v_mul_fabs_32_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %x.fabs, 32.0
@@ -2561,29 +2441,27 @@ define double @v_mul_fabs_32_f64(double %x) {
 }
 
 define double @v_mul_add_fma_fabs_32_f64(double %x, double %y) {
-; GFX9-LABEL: v_mul_add_fma_fabs_32_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_mul_add_fma_fabs_32_f64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_mul_add_fma_fabs_32_f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_mul_add_fma_fabs_32_f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_mul_add_fma_fabs_32_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, s[0:1], v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_add_fma_fabs_32_f64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, |v[0:1]|, v[2:3]
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %mul = fmul contract double %x.fabs, 32.0
   %fma = fadd contract double %mul, %y
@@ -2617,10 +2495,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_16_v2f64:
@@ -2633,10 +2509,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40300000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double 16.0, double 16.0>
   ret <2 x double> %mul
@@ -2669,10 +2543,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0300000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg16_v2f64:
@@ -2685,10 +2557,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0300000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double -16.0, double -16.0>
   ret <2 x double> %mul
@@ -2721,10 +2591,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f64:
@@ -2737,10 +2605,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40300000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   %mul = fmul <2 x double> %x.fabs, <double 16.0, double 16.0>
@@ -2757,10 +2623,8 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX9-GISEL-LABEL: s_mul_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
 ; GFX9-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
@@ -2775,9 +2639,7 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX10-GISEL-LABEL: s_mul_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, s[0:1]
 ; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-GISEL-NEXT:    ; return to shader part epilog
@@ -2791,9 +2653,7 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) {
 ;
 ; GFX11-GISEL-LABEL: s_mul_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, s[0:1]
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-GISEL-NEXT:    ; return to shader part epilog
@@ -6885,9 +6745,9 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 {
 ; GFX9-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constrained_fmul_32_f64:
@@ -6899,9 +6759,7 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 {
 ; GFX10-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40400000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constrained_fmul_32_f64:
@@ -6913,9 +6771,7 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 {
 ; GFX11-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40400000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40400000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret double %val
@@ -6931,9 +6787,9 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 ; GFX9-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x43f00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43f00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constrained_fmul_0x1p64_f64:
@@ -6945,9 +6801,7 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 ; GFX10-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x43f00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x43f00000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constrained_fmul_0x1p64_f64:
@@ -6959,9 +6813,7 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 ; GFX11-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x43f00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x43f00000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 18446744073709551616.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret double %val
@@ -6988,9 +6840,9 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_movk_i32 s5, 0x800
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64:
@@ -7002,9 +6854,7 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_movk_i32 s5, 0x800
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64:
@@ -7016,9 +6866,7 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_movk_i32 s1, 0x800
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 4.34584737989687770135e-311
@@ -7035,9 +6883,9 @@ define double @v_mul_fabs_neg256_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0700000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0700000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_neg256_f64:
@@ -7049,9 +6897,7 @@ define double @v_mul_fabs_neg256_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0700000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_neg256_f64:
@@ -7063,9 +6909,7 @@ define double @v_mul_fabs_neg256_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0700000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -256.0
@@ -7082,9 +6926,9 @@ define double @v_mul_fabs_neg8_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0200000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0200000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_neg8_f64:
@@ -7096,9 +6940,7 @@ define double @v_mul_fabs_neg8_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xc0200000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_neg8_f64:
@@ -7110,9 +6952,7 @@ define double @v_mul_fabs_neg8_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xc0200000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -8.0
@@ -7203,9 +7043,9 @@ define double @v_mul_fabs_negquarter_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xbfd00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbfd00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_negquarter_f64:
@@ -7217,9 +7057,7 @@ define double @v_mul_fabs_negquarter_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0xbfd00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_negquarter_f64:
@@ -7231,9 +7069,7 @@ define double @v_mul_fabs_negquarter_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0xbfd00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -0.25
@@ -7250,9 +7086,9 @@ define double @v_mul_fabs_quarter_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x3fd00000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fd00000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_quarter_f64:
@@ -7264,9 +7100,7 @@ define double @v_mul_fabs_quarter_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x3fd00000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_quarter_f64:
@@ -7278,9 +7112,7 @@ define double @v_mul_fabs_quarter_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x3fd00000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 0.25
@@ -7371,9 +7203,9 @@ define double @v_mul_fabs_8_f64(double %x) {
 ; GFX9-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40200000
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_8_f64:
@@ -7385,9 +7217,7 @@ define double @v_mul_fabs_8_f64(double %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0x40200000
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, |v[0:1]|
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_8_f64:
@@ -7399,9 +7229,7 @@ define double @v_mul_fabs_8_f64(double %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0x40200000
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[0:1]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, |v[0:1]|
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 8.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index d4c830c55030d6c..b60780db77378c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -435,21 +435,17 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40220000
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40220000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
@@ -457,7 +453,7 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 1, %cnt
   %conv = uitofp i64 %shl to double
@@ -529,21 +525,17 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0xc0220000
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0xc0220000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
@@ -551,7 +543,7 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 2, %cnt
   %conv = uitofp i64 %shl to double
@@ -721,8 +713,7 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x2000
-; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
@@ -740,23 +731,22 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0x2000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl8 = shl nuw i64 8, %cnt
@@ -838,24 +828,20 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40080000
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_mul_max_pow2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40080000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl2 = shl nuw i16 2, %cnt
   %shl1 = shl nuw i16 1, %cnt
@@ -925,21 +911,17 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x40220000
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x40220000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
@@ -947,7 +929,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 %v, %cnt
   %conv = uitofp i64 %shl to double
@@ -1206,8 +1188,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
@@ -1216,8 +1196,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec:
@@ -1225,11 +1205,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -1239,8 +1218,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1435,20 +1414,21 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
-; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
-; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
-; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
-; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v1
+; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
 ; VI-NEXT:    s_mov_b32 s5, 0x402e0000
-; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
-; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
-; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
+; VI-NEXT:    v_cvt_f64_u32_e32 v[5:6], v2
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[3:4], 32
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[5:6], 32
+; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v0
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v1
+; VI-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; VI-NEXT:    v_add_f64 v[4:5], v[4:5], v[0:1]
+; VI-NEXT:    v_mul_f64 v[0:1], v[2:3], s[4:5]
 ; VI-NEXT:    s_mov_b32 s4, 0
-; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
-; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
-; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s5, 0x402c0000
-; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
@@ -1456,8 +1436,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
-; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
@@ -1466,9 +1444,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-NEXT:    s_mov_b32 s5, 0x402c0000
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
@@ -1476,11 +1453,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
-; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
-; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -1489,11 +1465,9 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s1, 0x402c0000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1580,8 +1554,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    s_mov_b32 s5, 0x402e0000
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
@@ -1590,8 +1562,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
-; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
-; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
@@ -1599,11 +1571,10 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x402e0000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -1613,8 +1584,8 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
index 6e975c8a5370758..69c6a858162f39d 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir
@@ -84,6 +84,9 @@ body:             |
     SI_RETURN_TO_EPILOG %2
 ...
 
+# FIXME: This could be folded, but we do not know if operand of S_AND_B64 is signed or unsigned
+#        and if it will be sign or zero extended.
+
 ---
 name:            fold_uint_32bit_literal_sgpr
 tracksRegLiveness: true
@@ -92,7 +95,8 @@ body:             |
 
     ; GCN-LABEL: name: fold_uint_32bit_literal_sgpr
     ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-    ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 4294967295, implicit-def $scc
+    ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4294967295
+    ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], [[S_MOV_B64_]], implicit-def $scc
     ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
     %0:sreg_64 = IMPLICIT_DEF
     %1:sreg_64 = S_MOV_B64 4294967295
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 58b0a0f56918b0c..fdc8c908ded549c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1561,12 +1561,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] sc1
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1579,12 +1579,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_endpgm
@@ -1592,12 +1592,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1637,12 +1637,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] sc1
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
@@ -1838,12 +1838,12 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX940:       ; %bb.0: ; %main_body
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 055cfbdcc1ea397..3a0b8259d084965 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1821,14 +1821,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX6-NEXT:    v_add_f64 v[4:5], v[0:1], -v[4:5]
 ; GFX6-NEXT:    s_mov_b32 s9, 0x3fefffff
 ; GFX6-NEXT:    v_add_f64 v[6:7], v[0:1], -v[4:5]
-; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_min_f64 v[6:7], v[6:7], s[8:9]
 ; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_min_f64 v[6:7], v[6:7], s[8:9]
+; GFX6-NEXT:    s_mov_b32 s8, 0
 ; GFX6-NEXT:    s_mov_b32 s9, 0x7ff00000
-; GFX6-NEXT:    s_mov_b32 s8, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
 ; GFX6-NEXT:    v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9]
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
@@ -1841,13 +1841,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX7-LABEL: safe_math_fract_f64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s4, 0
 ; GFX7-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
 ; GFX7-NEXT:    v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX7-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
@@ -1872,10 +1873,8 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX11-LABEL: safe_math_fract_f64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s1, 0x7ff00000
 ; GFX11-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
-; GFX11-NEXT:    v_cmp_neq_f64_e64 vcc_lo, |v[0:1]|, s[0:1]
+; GFX11-NEXT:    v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
 ; GFX11-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 8bb8f6c464cd022..196a3705ac81876 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
 
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
 
 define double @v_sqrt_f64(double %x) {
 ; SDAG-LABEL: v_sqrt_f64:
@@ -37,11 +37,11 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-LABEL: v_sqrt_f64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -97,11 +97,11 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_fneg:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -158,11 +158,11 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -219,11 +219,11 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -281,11 +281,11 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -341,11 +341,11 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -401,11 +401,11 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -461,10 +461,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ;
 ; GISEL-LABEL: s_sqrt_f64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s2, 0
-; GISEL-NEXT:    s_brev_b32 s3, 8
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -532,10 +530,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ;
 ; GISEL-LABEL: s_sqrt_f64_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s2, 0
-; GISEL-NEXT:    s_brev_b32 s3, 8
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -603,10 +599,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ;
 ; GISEL-LABEL: s_sqrt_f64_afn:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s2, 0
-; GISEL-NEXT:    s_brev_b32 s3, 8
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -674,10 +668,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ;
 ; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_mov_b32 s2, 0
-; GISEL-NEXT:    s_brev_b32 s3, 8
-; GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -745,11 +737,11 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -805,11 +797,11 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -865,11 +857,11 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -925,11 +917,11 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_afn:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -985,11 +977,11 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_afn_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1066,12 +1058,14 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1142,11 +1136,11 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1202,11 +1196,11 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1263,11 +1257,11 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1344,12 +1338,14 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1420,11 +1416,11 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1480,11 +1476,11 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1540,11 +1536,11 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1600,11 +1596,11 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0
-; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1681,12 +1677,14 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1795,17 +1793,19 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; GISEL-LABEL: v_sqrt_v3f64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_brev_b32 s7, 8
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
-; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
-; GISEL-NEXT:    v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[6:7]
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v7
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v8, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v8, s[6:7]
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v9
 ; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
@@ -1870,5 +1870,3 @@ attributes #1 = { convergent nounwind willreturn memory(none) }
 attributes #2 = { "approx-func-fp-math"="true" }
 attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
 attributes #4 = { "unsafe-fp-math"="true" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 08e06d4dd015a55..d85778bc0195fbb 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -53,16 +53,16 @@ entry:
 define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32_max_neg_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0xfffff000
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
@@ -5403,12 +5403,12 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -6148,12 +6148,12 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 glc
@@ -6339,12 +6339,12 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
@@ -6606,16 +6606,16 @@ entry:
 define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_inc_i32_max_neg_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0xfffff000
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
@@ -7001,16 +7001,16 @@ entry:
 define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_dec_i32_max_neg_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0xfffff000
 ; SI-NEXT:    v_mov_b32_e32 v1, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 429bdd805ec5e11..4cbd5e84871cc75 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1096,8 +1096,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -1141,27 +1141,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX9-NEXT:  .LBB2_3:
@@ -1169,25 +1170,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -1226,9 +1225,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -1255,8 +1253,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -1266,10 +1263,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -1309,10 +1305,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -1350,27 +1345,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX9-DPP-NEXT:  .LBB2_3:
@@ -1378,25 +1374,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -1435,9 +1429,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -1464,8 +1457,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -1475,10 +1467,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -1518,10 +1509,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -2356,8 +2346,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -2401,27 +2391,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX9-NEXT:  .LBB4_3:
@@ -2429,25 +2420,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -2486,9 +2475,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -2515,8 +2503,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -2526,10 +2513,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -2569,10 +2555,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -2610,27 +2595,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX9-DPP-NEXT:  .LBB4_3:
@@ -2638,25 +2624,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -2695,9 +2679,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -2724,8 +2707,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -2735,10 +2717,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -2778,10 +2759,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -4320,8 +4300,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -4365,27 +4345,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-NEXT:  .LBB7_3:
@@ -4393,25 +4374,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -4450,9 +4429,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -4479,8 +4457,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -4490,10 +4467,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -4533,10 +4509,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -4574,27 +4549,28 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
@@ -4602,25 +4578,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -4659,9 +4633,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -4688,8 +4661,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -4699,10 +4671,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -4742,10 +4713,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index f05a420a1b0a269..4a00d7bc71bca8a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1192,8 +1192,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -1237,27 +1237,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX9-NEXT:  .LBB2_3:
@@ -1265,25 +1266,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -1322,9 +1321,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -1351,8 +1349,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -1362,10 +1359,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -1405,10 +1401,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -1446,27 +1441,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX9-DPP-NEXT:  .LBB2_3:
@@ -1474,25 +1470,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -1531,9 +1525,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -1560,8 +1553,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -1571,10 +1563,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -1614,10 +1605,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -2452,8 +2442,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -2497,27 +2487,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX9-NEXT:  .LBB4_3:
@@ -2525,25 +2516,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -2582,9 +2571,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -2611,8 +2599,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -2622,10 +2609,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -2665,10 +2651,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -2706,27 +2691,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX9-DPP-NEXT:  .LBB4_3:
@@ -2734,25 +2720,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -2791,9 +2775,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -2820,8 +2803,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -2831,10 +2813,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -2874,10 +2855,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -4504,8 +4484,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s7, 0x43300000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
@@ -4549,27 +4529,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-NEXT:  .LBB7_3:
@@ -4577,25 +4558,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -4634,9 +4613,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -4663,8 +4641,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_clause 0x1
 ; GFX1164-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -4674,10 +4651,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -4717,10 +4693,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_load_b32 s3, s[0:1], 0x0
@@ -4758,27 +4733,28 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
-; GFX9-DPP-NEXT:    v_add_f64 v[1:2], s[2:3], v[0:1]
+; GFX9-DPP-NEXT:    s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
 ; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v1, v[1:2]
+; GFX9-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-DPP-NEXT:    v_mul_f32_e32 v3, 4.0, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v3
-; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
@@ -4786,25 +4762,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064-DPP-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
 ; GFX1064-DPP-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s11, 0x31e16000
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s8, s3
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX1064-DPP-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b32 s3, 0x43300000
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1064-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[2:3]
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -4843,9 +4817,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_bcnt1_i32_b32 s4, s3
 ; GFX1032-DPP-NEXT:    s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], s[4:5], s[2:3]
+; GFX1032-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, s[4:5]
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-DPP-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -4872,8 +4845,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b32 s2, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-DPP-NEXT:    s_clause 0x1
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v0, off offset:12
 ; GFX1164-DPP-NEXT:    scratch_store_b32 off, v1, off offset:8
@@ -4883,10 +4855,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1164-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
@@ -4926,10 +4897,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scop
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX1132-DPP-NEXT:    v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_load_b32 s3, s[0:1], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 173e4f656c90deb..a290dd5fd145c93 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -205,9 +205,8 @@ entry:
 
 ; FIXME: Should not have intermediate sgprs
 ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
-; CHECK: s_mov_b64 s[0:1], 0x1e240
-; CHECK: v_mov_b32_e32 v0, s0
-; CHECK: v_mov_b32_e32 v1, s1
+; CHECK: v_mov_b32_e32 v0, 0x1e240
+; CHECK: v_mov_b32_e32 v1, 0
 ; CHECK: use v[0:1]
 define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index f5d41b246b1b8c0..220ea962b9e1dca 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -85,17 +85,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_addc_u32 s1, s1, f0 at gotpcrel32@hi+12
 ; GFX11-NEXT:    s_mov_b32 s13, s14
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s21, s14
+; GFX11-NEXT:    s_mov_b32 s3, s14
 ; GFX11-NEXT:    s_mov_b32 s14, s15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s14, s21
+; GFX11-NEXT:    s_mov_b32 s14, s3
 ; GFX11-NEXT:    s_mov_b32 s1, -1
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_4
+; GFX11-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11-NEXT:    s_branch .LBB2_12
 ; GFX11-NEXT:  .LBB2_3:
 ; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
 ; GFX11-NEXT:  .LBB2_4: ; %bb16
 ; GFX11-NEXT:    s_load_b32 s2, s[16:17], 0x54
 ; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index ae470efc92feee4..3bc503e3714fe5b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,15 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s7, s[0:1], 0x34
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
-; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s7, s7, 4
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s7
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_lshl_b32 s6, s6, 4
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -419,16 +418,15 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s7, s[0:1], 0x34
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
-; GCN-NEXT:    s_mov_b32 s6, 0x10001
+; GCN-NEXT:    s_mov_b32 s4, 0x10001
+; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s7, s7, 4
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s7
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_lshl_b32 s6, s6, 4
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -445,12 +443,11 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b64 s[4:5], 0xff
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
+; GCN-NEXT:    s_lshl_b32 s4, s4, 3
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
 ; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
 ; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index c58dbd6bd12069c..68427e8937bb946 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1550,10 +1550,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_lshl_b32 s0, s8, 4
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b32 s8, s8, 4
-; SI-NEXT:    s_mov_b64 s[0:1], 0xffff
-; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
 ; SI-NEXT:    s_and_b32 s9, s1, 0x50005
 ; SI-NEXT:    s_and_b32 s8, s0, 0x50005
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
@@ -1572,11 +1571,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
-; VI-NEXT:    s_mov_b64 s[0:1], 0xffff
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; VI-NEXT:    s_lshl_b32 s0, s8, 4
 ; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
 ; VI-NEXT:    s_mov_b32 s9, s8
 ; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
@@ -1725,17 +1723,16 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_lshl_b32 s0, s8, 3
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; SI-NEXT:    s_lshl_b32 s8, s8, 3
-; SI-NEXT:    s_mov_b64 s[2:3], 0xff
-; SI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; SI-NEXT:    s_and_b32 s9, s3, 0x5050505
-; SI-NEXT:    s_and_b32 s8, s2, 0x5050505
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1748,17 +1745,16 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshl_b32 s0, s8, 3
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; VI-NEXT:    s_lshl_b32 s8, s8, 3
-; VI-NEXT:    s_mov_b64 s[2:3], 0xff
-; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; VI-NEXT:    s_and_b32 s9, s3, 0x5050505
-; VI-NEXT:    s_and_b32 s8, s2, 0x5050505
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index f98b41ba199bd7f..47f7943e076a4a0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2064,19 +2064,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ;
 ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x10
 ; GFX11-NEXT:    global_load_b32 v2, v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0xffff
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 0xffff
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, v3, s0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -2106,12 +2104,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0xffff
-; GFX9-NEXT:    s_lshl_b32 s4, s7, 4
-; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s7, 4
+; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s6, s6
+; GFX9-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfi_b32 v1, s3, v3, v1
 ; GFX9-NEXT:    v_bfi_b32 v0, s2, v4, v0
@@ -2128,14 +2125,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_lshl_b32 s1, s5, 4
-; VI-NEXT:    s_lshl_b32 s5, s4, 16
-; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_lshl_b32 s1, s4, 16
+; VI-NEXT:    s_and_b32 s2, s4, 0xffff
+; VI-NEXT:    s_lshl_b32 s3, s5, 4
+; VI-NEXT:    s_or_b32 s2, s2, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
-; VI-NEXT:    s_or_b32 s2, s4, s5
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v5, s2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2155,14 +2151,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_and_b32 s6, s4, 0xffff
-; CI-NEXT:    s_lshl_b32 s1, s5, 4
-; CI-NEXT:    s_lshl_b32 s4, s4, 16
+; CI-NEXT:    s_and_b32 s1, s4, 0xffff
+; CI-NEXT:    s_lshl_b32 s2, s4, 16
+; CI-NEXT:    s_lshl_b32 s3, s5, 4
+; CI-NEXT:    s_or_b32 s2, s1, s2
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
-; CI-NEXT:    s_or_b32 s2, s6, s4
+; CI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s3
 ; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    v_mov_b32_e32 v5, s2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2177,15 +2172,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
-; GFX11-NEXT:    s_mov_b64 s[2:3], 0xffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s0, s0
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s0, s0
+; GFX11-NEXT:    s_lshl_b64 s[0:1], 0xffff, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v1, s1, s6, v1
-; GFX11-NEXT:    v_bfi_b32 v0, s0, s6, v0
+; GFX11-NEXT:    v_bfi_b32 v1, s1, s2, v1
+; GFX11-NEXT:    v_bfi_b32 v0, s0, s2, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
index 741164bc045062e..29a96c227f2f0a1 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -29,7 +29,7 @@ declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
 define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %stack, <4 x i32> %node, ptr %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, ptr addrspace(1) %arrayidx.i.i2202, ptr addrspace(1) %retval.0.i.i22089, ptr addrspace(1) %retval.1.i221310, i1 %cmp575, ptr addrspace(1) %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
 ; GCN-LABEL: {{^}}svm_node_closure_bsdf:
 ; GCN-NOT: v_writelane_b32
-; GCN: s_movk_i32 s28, 0x60
+; GCN: s_movk_i32 s26, 0x60
 ; GCN-NOT: s31
 ; GCN-NOT: v_readlane_b32
 ; GCN: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index dee5b724934a0b7..31295f2a543f2d4 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -674,98 +674,91 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
 define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmin_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s11, 0xe8f000
-; SI-NEXT:    s_add_u32 s8, s8, s3
-; SI-NEXT:    s_addc_u32 s9, s9, 0
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s7, 0xe8f000
+; SI-NEXT:    s_add_u32 s4, s4, s3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s5, s4, 4
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_mov_b32 s3, 0x40450000
-; SI-NEXT:    s_add_i32 s4, s4, 32
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_lshl_b32 s3, s2, 4
+; SI-NEXT:    s_lshl_b32 s2, s2, 3
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s2, s2, 32
+; SI-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; SI-NEXT:    v_mov_b32_e32 v2, s2
 ; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; SI-NEXT:    s_add_i32 s2, s5, 64
-; SI-NEXT:    v_mov_b32_e32 v4, s2
-; SI-NEXT:    ds_min_f64 v4, v[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_add_i32 s1, s3, 64
+; SI-NEXT:    v_mov_b32_e32 v5, s1
+; SI-NEXT:    ds_min_f64 v5, v[0:1]
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; SI-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
 ; SI-NEXT:    s_add_i32 s1, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
+; SI-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: lds_ds_fmin_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_mov_b32 s10, -1
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s11, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s8, s8, s3
-; GFX7-NEXT:    s_mov_b32 s2, 0
-; GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
+; GFX7-NEXT:    s_add_u32 s4, s4, s3
+; GFX7-NEXT:    s_addc_u32 s5, s5, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s2, s4, 3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s4, 4
-; GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; GFX7-NEXT:    ds_min_f64 v4, v[0:1] offset:64
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX7-NEXT:    v_mov_b32_e32 v5, s2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    ds_min_f64 v5, v[0:1] offset:64
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; GFX7-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
 ; GFX7-NEXT:    s_add_i32 s1, s0, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
+; GFX7-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_ds_fmin_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s90, -1
 ; VI-NEXT:    s_mov_b32 s91, 0xe80000
 ; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_mov_b32 s3, 0x40450000
-; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s2, s4, 3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshl_b32 s3, s2, 3
+; VI-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT:    s_lshl_b32 s2, s4, 4
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    ds_min_f64 v4, v[0:1] offset:64
-; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    s_lshl_b32 s2, s2, 4
+; VI-NEXT:    v_mov_b32_e32 v5, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    ds_min_f64 v5, v[0:1] offset:64
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[2:3]
+; VI-NEXT:    ds_min_rtn_f64 v[0:1], v4, v[2:3]
 ; VI-NEXT:    s_add_i32 s1, s0, 4
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -783,11 +776,9 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX9-NEXT:    s_add_u32 s8, s8, s3
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -814,17 +805,15 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    s_mov_b32 s1, 0x40450000
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s5, s4, 3
+; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX10-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
 ; GFX10-NEXT:    ds_min_f64 v4, v[0:1] offset:64
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -838,16 +827,14 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX11-LABEL: lds_ds_fmin_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x40450000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s5, s4, 3
-; GFX11-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v2, s5
-; GFX11-NEXT:    s_lshl_b32 s2, s4, 4
+; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
+; GFX11-NEXT:    v_mov_b32_e32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX11-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
@@ -870,11 +857,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_SI-NEXT:    s_mov_b32 s2, 0
 ; G_SI-NEXT:    s_addc_u32 s9, s9, 0
 ; G_SI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_SI-NEXT:    s_add_i32 s4, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
+; G_SI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_SI-NEXT:    s_lshl_b32 s2, s4, 3
+; G_SI-NEXT:    v_mov_b32_e32 v1, s3
 ; G_SI-NEXT:    v_mov_b32_e32 v2, s2
 ; G_SI-NEXT:    s_mov_b32 m0, -1
 ; G_SI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
@@ -904,11 +891,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX7-NEXT:    s_mov_b32 s2, 0
 ; G_GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 3
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; G_GFX7-NEXT:    s_mov_b32 m0, -1
 ; G_GFX7-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
@@ -938,11 +925,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_VI-NEXT:    s_mov_b32 s2, 0
 ; G_VI-NEXT:    s_addc_u32 s89, s89, 0
 ; G_VI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_VI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    s_add_i32 s4, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
+; G_VI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_VI-NEXT:    s_lshl_b32 s2, s4, 3
+; G_VI-NEXT:    v_mov_b32_e32 v1, s3
 ; G_VI-NEXT:    v_mov_b32_e32 v2, s2
 ; G_VI-NEXT:    s_mov_b32 m0, -1
 ; G_VI-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
@@ -972,11 +959,11 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX9-NEXT:    s_mov_b32 s0, 0
 ; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; G_GFX9-NEXT:    ds_min_rtn_f64 v[2:3], v2, v[0:1]
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
@@ -1031,10 +1018,10 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX11-NEXT:    s_add_i32 s4, s2, 4
 ; G_GFX11-NEXT:    s_mov_b32 s2, 0
-; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
 ; G_GFX11-NEXT:    s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
 ; G_GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; G_GFX11-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
 ; G_GFX11-NEXT:    s_lshl_b32 s2, s4, 4
 ; G_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; G_GFX11-NEXT:    v_mov_b32_e32 v4, s2
@@ -1060,98 +1047,91 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
 define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmax_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s11, 0xe8f000
-; SI-NEXT:    s_add_u32 s8, s8, s3
-; SI-NEXT:    s_addc_u32 s9, s9, 0
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s7, 0xe8f000
+; SI-NEXT:    s_add_u32 s4, s4, s3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s5, s4, 4
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_mov_b32 s3, 0x40450000
-; SI-NEXT:    s_add_i32 s4, s4, 32
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_lshl_b32 s3, s2, 4
+; SI-NEXT:    s_lshl_b32 s2, s2, 3
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s2, s2, 32
+; SI-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; SI-NEXT:    v_mov_b32_e32 v2, s2
 ; SI-NEXT:    s_mov_b32 m0, -1
 ; SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; SI-NEXT:    s_add_i32 s2, s5, 64
-; SI-NEXT:    v_mov_b32_e32 v4, s2
-; SI-NEXT:    ds_max_f64 v4, v[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_add_i32 s1, s3, 64
+; SI-NEXT:    v_mov_b32_e32 v5, s1
+; SI-NEXT:    ds_max_f64 v5, v[0:1]
 ; SI-NEXT:    s_waitcnt lgkmcnt(1)
-; SI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; SI-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
 ; SI-NEXT:    s_add_i32 s1, s0, 4
-; SI-NEXT:    v_mov_b32_e32 v2, s1
+; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, v2, s[8:11], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
+; SI-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
 ; SI-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: lds_ds_fmax_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT:    s_mov_b32 s10, -1
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GFX7-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 s11, 0xe8f000
-; GFX7-NEXT:    s_add_u32 s8, s8, s3
-; GFX7-NEXT:    s_mov_b32 s2, 0
-; GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xe8f000
+; GFX7-NEXT:    s_add_u32 s4, s4, s3
+; GFX7-NEXT:    s_addc_u32 s5, s5, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshl_b32 s2, s4, 3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    s_lshl_b32 s3, s2, 3
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; GFX7-NEXT:    s_lshl_b32 s2, s4, 4
-; GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; GFX7-NEXT:    ds_max_f64 v4, v[0:1] offset:64
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
+; GFX7-NEXT:    v_mov_b32_e32 v5, s2
+; GFX7-NEXT:    v_mov_b32_e32 v4, s1
+; GFX7-NEXT:    ds_max_f64 v5, v[0:1] offset:64
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
 ; GFX7-NEXT:    s_add_i32 s1, s0, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v1, v3, s[8:11], 0 offen
-; GFX7-NEXT:    buffer_store_dword v0, v2, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_store_dword v1, v3, s[4:7], 0 offen
+; GFX7-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_ds_fmax_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_mov_b32 s90, -1
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s90, -1
 ; VI-NEXT:    s_mov_b32 s91, 0xe80000
 ; VI-NEXT:    s_add_u32 s88, s88, s3
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_mov_b32 s3, 0x40450000
-; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s89, s89, 0
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s2, s4, 3
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_lshl_b32 s3, s2, 3
+; VI-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT:    s_lshl_b32 s2, s4, 4
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    ds_max_f64 v4, v[0:1] offset:64
-; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    s_lshl_b32 s2, s2, 4
+; VI-NEXT:    v_mov_b32_e32 v5, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    ds_max_f64 v5, v[0:1] offset:64
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
-; VI-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[2:3]
+; VI-NEXT:    ds_max_rtn_f64 v[0:1], v4, v[2:3]
 ; VI-NEXT:    s_add_i32 s1, s0, 4
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -1169,11 +1149,9 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX9-NEXT:    s_add_u32 s8, s8, s3
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s4, 3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
@@ -1200,17 +1178,15 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-NEXT:    s_mov_b32 s1, 0x40450000
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s5, s4, 3
+; GFX10-NEXT:    s_lshl_b32 s0, s4, 3
+; GFX10-NEXT:    v_mov_b32_e32 v5, s3
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_lshl_b32 s0, s4, 4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX10-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
 ; GFX10-NEXT:    ds_max_f64 v4, v[0:1] offset:64
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1224,16 +1200,14 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; GFX11-LABEL: lds_ds_fmax_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x40450000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s5, s4, 3
-; GFX11-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v2, s5
-; GFX11-NEXT:    s_lshl_b32 s2, s4, 4
+; GFX11-NEXT:    s_lshl_b32 s3, s2, 3
+; GFX11-NEXT:    v_mov_b32_e32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX11-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
@@ -1256,11 +1230,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_SI-NEXT:    s_mov_b32 s2, 0
 ; G_SI-NEXT:    s_addc_u32 s9, s9, 0
 ; G_SI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_SI-NEXT:    s_add_i32 s4, s4, 4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
+; G_SI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_SI-NEXT:    s_lshl_b32 s2, s4, 3
+; G_SI-NEXT:    v_mov_b32_e32 v1, s3
 ; G_SI-NEXT:    v_mov_b32_e32 v2, s2
 ; G_SI-NEXT:    s_mov_b32 m0, -1
 ; G_SI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
@@ -1290,11 +1264,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX7-NEXT:    s_mov_b32 s2, 0
 ; G_GFX7-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX7-NEXT:    s_mov_b32 s3, 0x40450000
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX7-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; G_GFX7-NEXT:    s_lshl_b32 s2, s4, 3
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; G_GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; G_GFX7-NEXT:    s_mov_b32 m0, -1
 ; G_GFX7-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
@@ -1324,11 +1298,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_VI-NEXT:    s_mov_b32 s2, 0
 ; G_VI-NEXT:    s_addc_u32 s89, s89, 0
 ; G_VI-NEXT:    s_mov_b32 s3, 0x40450000
-; G_VI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_VI-NEXT:    s_add_i32 s4, s4, 4
-; G_VI-NEXT:    v_mov_b32_e32 v1, s3
+; G_VI-NEXT:    v_mov_b32_e32 v0, s2
 ; G_VI-NEXT:    s_lshl_b32 s2, s4, 3
+; G_VI-NEXT:    v_mov_b32_e32 v1, s3
 ; G_VI-NEXT:    v_mov_b32_e32 v2, s2
 ; G_VI-NEXT:    s_mov_b32 m0, -1
 ; G_VI-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
@@ -1358,11 +1332,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX9-NEXT:    s_mov_b32 s0, 0
 ; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
 ; G_GFX9-NEXT:    s_mov_b32 s1, 0x40450000
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX9-NEXT:    s_add_i32 s4, s4, 4
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 3
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; G_GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; G_GFX9-NEXT:    ds_max_rtn_f64 v[2:3], v2, v[0:1]
 ; G_GFX9-NEXT:    s_lshl_b32 s0, s4, 4
@@ -1417,10 +1391,10 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
 ; G_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX11-NEXT:    s_add_i32 s4, s2, 4
 ; G_GFX11-NEXT:    s_mov_b32 s2, 0
-; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
 ; G_GFX11-NEXT:    s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT:    s_lshl_b32 s5, s4, 3
 ; G_GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; G_GFX11-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
 ; G_GFX11-NEXT:    s_lshl_b32 s2, s4, 4
 ; G_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; G_GFX11-NEXT:    v_mov_b32_e32 v4, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index 5a950d803e9c5d1..af59d62b2e2d059 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,SDAG-GFX11 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,SDAG-GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX10 %s
 
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,GISEL-GFX11 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GISEL-GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr="+wavefrontsize32" -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX10 %s
 
 declare i32 @llvm.amdgcn.fcmp.f32(float, float, i32) #0
 declare i32 @llvm.amdgcn.fcmp.f64(double, double, i32) #0
@@ -962,10 +962,8 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -976,11 +974,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -988,11 +984,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1003,11 +997,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1020,10 +1012,8 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_one:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1034,11 +1024,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_one:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1046,11 +1034,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_one:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1061,11 +1047,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_one:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1078,10 +1062,8 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_gt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1092,11 +1074,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1104,11 +1084,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_gt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1119,11 +1097,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1136,10 +1112,8 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_oge:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_ge_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1150,11 +1124,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1162,11 +1134,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_ge_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1177,11 +1147,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1194,10 +1162,8 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_olt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_lt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1208,11 +1174,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1220,11 +1184,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_lt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1235,11 +1197,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1252,10 +1212,8 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ole:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_le_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1266,11 +1224,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1278,11 +1234,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_le_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1293,11 +1247,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1310,10 +1262,8 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1324,11 +1274,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1336,11 +1284,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1351,11 +1297,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1368,10 +1312,8 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_o:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_o_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1382,11 +1324,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_o:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1394,11 +1334,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_o:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_o_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1409,11 +1347,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_o:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1426,10 +1362,8 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_uo:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_u_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1440,11 +1374,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1452,11 +1384,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_u_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1467,11 +1397,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1484,10 +1412,8 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_une:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1498,11 +1424,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_une:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1510,11 +1434,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_une:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1525,11 +1447,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_une:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1542,10 +1462,8 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_nle_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1556,11 +1474,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1568,11 +1484,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_nle_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1583,11 +1497,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1600,10 +1512,8 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_uge:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1614,11 +1524,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1626,11 +1534,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1641,11 +1547,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1658,10 +1562,8 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ult:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_nge_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1672,11 +1574,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1684,11 +1584,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_nge_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1699,11 +1597,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1716,10 +1612,8 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-LABEL: v_fcmp_f64_ule:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1730,11 +1624,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b32 s4, 0
-; SDAG-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1742,11 +1634,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1757,11 +1647,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b32 s4, 0
-; GISEL-GFX10-NEXT:    s_mov_b32 s5, 0x40590000
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -2754,7 +2642,3 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 }
 
 attributes #0 = { nounwind readnone convergent }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GFX10: {{.*}}
-; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index e2bdcfa6bbddc87..8c76df3e041fdf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -1046,11 +1046,9 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_oeq:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1059,32 +1057,18 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_oeq:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_oeq:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_oeq:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oeq:
 ; VI-SDAG:       ; %bb.0:
@@ -1103,10 +1087,8 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_oeq:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1124,11 +1106,9 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_one:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1137,32 +1117,18 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_one:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_one:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_one:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_one:
 ; VI-SDAG:       ; %bb.0:
@@ -1181,10 +1147,8 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_one:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1202,11 +1166,9 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ogt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1215,32 +1177,18 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ogt:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ogt:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ogt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ogt:
 ; VI-SDAG:       ; %bb.0:
@@ -1259,10 +1207,8 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ogt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1280,11 +1226,9 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_oge:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1293,32 +1237,18 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_oge:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_oge:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_oge:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oge:
 ; VI-SDAG:       ; %bb.0:
@@ -1337,10 +1267,8 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_oge:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1358,11 +1286,9 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_olt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1371,32 +1297,18 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_olt:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_olt:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_olt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_olt:
 ; VI-SDAG:       ; %bb.0:
@@ -1415,10 +1327,8 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_olt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1436,11 +1346,9 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ole:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1449,32 +1357,18 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ole:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ole:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ole:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ole:
 ; VI-SDAG:       ; %bb.0:
@@ -1493,10 +1387,8 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ole:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1514,11 +1406,9 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ueq:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1527,32 +1417,18 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ueq:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ueq:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ueq:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ueq:
 ; VI-SDAG:       ; %bb.0:
@@ -1571,10 +1447,8 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ueq:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1592,11 +1466,9 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_o:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1605,32 +1477,18 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_o:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_o:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_o:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_o:
 ; VI-SDAG:       ; %bb.0:
@@ -1649,10 +1507,8 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_o:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1670,11 +1526,9 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_uo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1683,32 +1537,18 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_uo:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_uo:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_uo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uo:
 ; VI-SDAG:       ; %bb.0:
@@ -1727,10 +1567,8 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_uo:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1748,11 +1586,9 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_une:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1761,32 +1597,18 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_une:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_une:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_une:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_une:
 ; VI-SDAG:       ; %bb.0:
@@ -1805,10 +1627,8 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_une:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1826,11 +1646,9 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ugt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1839,32 +1657,18 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ugt:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ugt:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ugt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ugt:
 ; VI-SDAG:       ; %bb.0:
@@ -1883,10 +1687,8 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ugt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1904,11 +1706,9 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_uge:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1917,32 +1717,18 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_uge:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_uge:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_uge:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uge:
 ; VI-SDAG:       ; %bb.0:
@@ -1961,10 +1747,8 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_uge:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1982,11 +1766,9 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ult:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1995,32 +1777,18 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ult:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ult:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ult:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ult:
 ; VI-SDAG:       ; %bb.0:
@@ -2039,10 +1807,8 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ult:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2060,11 +1826,9 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GFX11-LABEL: v_fcmp_f64_ule:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s5, 0x40590000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -2073,32 +1837,18 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: v_fcmp_f64_ule:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40590000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-SDAG-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: v_fcmp_f64_ule:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: v_fcmp_f64_ule:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ule:
 ; VI-SDAG:       ; %bb.0:
@@ -2117,10 +1867,8 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; VI-GISEL-LABEL: v_fcmp_f64_ule:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40590000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index 44d1cfb96146eb7..7a492f51cce49f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -633,9 +633,8 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_eq:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_eq_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -646,10 +645,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_eq:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -657,10 +655,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_eq:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_eq_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -671,10 +668,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_eq:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -687,9 +683,8 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_ne:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_ne_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -700,10 +695,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_ne:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -711,10 +705,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_ne:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_ne_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -725,10 +718,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_ne:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -741,9 +733,8 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_u64_ugt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -754,10 +745,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -765,10 +755,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -779,10 +768,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -795,9 +783,8 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_u64_uge:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_ge_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -808,10 +795,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_u64_uge:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -819,10 +805,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_u64_uge:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_ge_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -833,10 +818,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_u64_uge:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -849,9 +833,8 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_u64_ult:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -862,10 +845,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_u64_ult:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -873,10 +855,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_u64_ult:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -887,10 +868,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_u64_ult:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -903,9 +883,8 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_u64_ule:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_le_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -916,10 +895,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_u64_ule:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -927,10 +905,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_u64_ule:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_le_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -941,10 +918,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_u64_ule:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -957,9 +933,8 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_sgt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -970,10 +945,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -981,10 +955,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -995,10 +968,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1011,9 +983,8 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_sge:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_ge_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1024,10 +995,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_sge:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1035,10 +1005,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_sge:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_ge_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1049,10 +1018,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_sge:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1065,9 +1033,8 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_slt:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1078,10 +1045,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_slt:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1089,10 +1055,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_slt:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1103,10 +1068,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_slt:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
@@ -1119,9 +1083,8 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-LABEL: v_icmp_i64_sle:
 ; SDAG-GFX11:       ; %bb.0:
 ; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    v_cmp_le_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX11-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1132,10 +1095,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX10-LABEL: v_icmp_i64_sle:
 ; SDAG-GFX10:       ; %bb.0:
 ; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s2, s[2:3], s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
@@ -1143,10 +1105,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-LABEL: v_icmp_i64_sle:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GISEL-GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    v_cmp_le_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX11-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -1157,10 +1118,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX10-LABEL: v_icmp_i64_sle:
 ; GISEL-GFX10:       ; %bb.0:
 ; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX10-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s2, s[2:3], s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 5d1aa7cbb9992bc..80e0202962462c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -689,10 +689,9 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_eq:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -715,24 +714,24 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_eq:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_eq:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_eq:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -741,20 +740,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_eq:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -764,10 +749,9 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_ne:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -790,24 +774,24 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_ne:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_ne:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_ne:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -816,20 +800,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_ne:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -839,10 +809,9 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_u64_ugt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -865,24 +834,24 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_u64_ugt:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_u64_ugt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ugt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -891,20 +860,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_u64_ugt:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -914,10 +869,9 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_u64_uge:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -940,24 +894,24 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_u64_uge:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_u64_uge:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_uge:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -966,20 +920,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_u64_uge:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -989,10 +929,9 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_u64_ult:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1015,24 +954,24 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_u64_ult:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_u64_ult:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ult:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1041,20 +980,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_u64_ult:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1064,10 +989,9 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_u64_ule:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1090,24 +1014,24 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_u64_ule:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_u64_ule:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ule:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1116,20 +1040,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_u64_ule:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1139,10 +1049,9 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_sgt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1165,24 +1074,24 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_sgt:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_sgt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sgt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1191,20 +1100,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_sgt:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1214,10 +1109,9 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_sge:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1240,24 +1134,24 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_sge:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_sge:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sge:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1266,20 +1160,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_sge:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1289,10 +1169,9 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_slt:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1315,24 +1194,24 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_slt:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_slt:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_slt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1341,20 +1220,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_slt:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1364,10 +1229,9 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-LABEL: v_icmp_i64_sle:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    s_mov_b64 s[4:5], 0x64
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
@@ -1390,24 +1254,24 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SDAG-VI-NEXT:    s_endpgm
 ;
-; SDAG-GFX9-LABEL: v_icmp_i64_sle:
-; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
-; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; SDAG-GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; SDAG-GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; SDAG-GFX9-NEXT:    s_endpgm
+; GFX9-LABEL: v_icmp_i64_sle:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sle:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-VI-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-VI-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x64
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1416,20 +1280,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GISEL-VI-NEXT:    s_endpgm
-;
-; GISEL-GFX9-LABEL: v_icmp_i64_sle:
-; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GISEL-GFX9-NEXT:    s_mov_b64 s[4:5], 0x64
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GISEL-GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GISEL-GFX9-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
   store i64 %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index 2be94f4dbc6501f..6e6a3cbdd4887d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -47,9 +47,7 @@ define amdgpu_kernel void @rsq_f64_constant_4.0(ptr addrspace(1) %out) #1 {
 }
 
 ; FUNC-LABEL: {{^}}rsq_f64_constant_100.0
-; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
-; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 0x40590000
 define amdgpu_kernel void @rsq_f64_constant_100.0(ptr addrspace(1) %out) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
   store double %rsq, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 684edd27536b541..16b1ccf58cf6a51 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -898,12 +898,13 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
+; GFX6-SDAG-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32:
@@ -936,11 +937,11 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6]
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -950,16 +951,16 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 }
 
 define double @test_frexp_f64_i32_only_use_fract(double %a) {
-; GFX6-LABEL: test_frexp_f64_i32_only_use_fract:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX6-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_fract:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-SDAG-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX8:       ; %bb.0:
@@ -978,21 +979,32 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[4:5], v[0:1]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 0
   ret double %result.0
 }
 
 define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
-; GFX6-LABEL: test_frexp_f64_i32_only_use_exp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_exp:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX8:       ; %bb.0:
@@ -1011,6 +1023,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 1
   ret i32 %result.0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 438b1bfe319a04e..2a1488652d887a4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -4558,10 +4558,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v17, 0
-; GFX8-NEXT:    v_mov_b32_e32 v21, 0
-; GFX8-NEXT:    v_mov_b32_e32 v19, v17
-; GFX8-NEXT:    v_mov_b32_e32 v13, v17
+; GFX8-NEXT:    v_mov_b32_e32 v20, 0
+; GFX8-NEXT:    v_mov_b32_e32 v19, 0
+; GFX8-NEXT:    v_mov_b32_e32 v17, v20
+; GFX8-NEXT:    v_mov_b32_e32 v22, v20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -4571,61 +4571,62 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_add_u32 s4, s0, 0x50
 ; GFX8-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v24, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v23, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v9, v17
-; GFX8-NEXT:    v_mov_b32_e32 v5, v17
-; GFX8-NEXT:    v_mov_b32_e32 v22, 0
+; GFX8-NEXT:    v_mov_b32_e32 v23, v20
+; GFX8-NEXT:    v_mov_b32_e32 v13, v20
+; GFX8-NEXT:    v_mov_b32_e32 v9, v20
+; GFX8-NEXT:    v_mov_b32_e32 v5, v20
+; GFX8-NEXT:    v_mov_b32_e32 v25, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 10, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v16, 1, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 11, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v4
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v4
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 14, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[18:21]
+; GFX8-NEXT:    v_mov_b32_e32 v17, s3
+; GFX8-NEXT:    v_mov_b32_e32 v16, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 64
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 1
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v18, 15, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[16:19]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s3
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v19, 1, v4
+; GFX8-NEXT:    v_lshrrev_b16_e32 v21, 15, v2
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
+; GFX8-NEXT:    v_mov_b32_e32 v17, s3
+; GFX8-NEXT:    v_and_b32_sdwa v19, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v23, s2
+; GFX8-NEXT:    v_mov_b32_e32 v16, s2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
-; GFX8-NEXT:    v_mov_b32_e32 v19, 0
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v22, 0
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v0
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[16:19]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v20
+; GFX8-NEXT:    v_mov_b32_e32 v19, s3
+; GFX8-NEXT:    v_mov_b32_e32 v18, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v26, s3
+; GFX8-NEXT:    v_mov_b32_e32 v21, s3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 12, v2
-; GFX8-NEXT:    v_mov_b32_e32 v25, s2
+; GFX8-NEXT:    v_mov_b32_e32 v20, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 13, v2
-; GFX8-NEXT:    v_mov_b32_e32 v20, v17
-; GFX8-NEXT:    v_mov_b32_e32 v1, v17
 ; GFX8-NEXT:    v_mov_b32_e32 v17, s1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s0
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v0
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 7, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 6, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[19:22]
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 4, v2
@@ -4634,23 +4635,23 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 3, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v24, 1, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s3
-; GFX8-NEXT:    v_mov_b32_e32 v21, s1
+; GFX8-NEXT:    v_mov_b32_e32 v23, s1
 ; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX8-NEXT:    v_and_b32_e32 v6, 1, v14
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v22
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v24
 ; GFX8-NEXT:    v_mov_b32_e32 v18, s2
-; GFX8-NEXT:    v_mov_b32_e32 v20, s0
+; GFX8-NEXT:    v_mov_b32_e32 v22, s0
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
+; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
 ; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
 ; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index e89c44d5b94a899..a078d89acf382b5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3583,190 +3583,191 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s6
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[26:27]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s2, s2, 64
-; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_add_u32 s8, s2, 48
+; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s9
+; GCN-HSA-NEXT:    s_add_u32 s10, s2, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s8
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[16:17]
+; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-HSA-NEXT:    s_add_u32 s10, s2, 0x50
+; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT:    s_add_u32 s10, s2, 0x60
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[4:5]
+; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT:    s_add_u32 s2, s2, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[12:13]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s6
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[12:13]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[14:15]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s1
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v37, 16, v21
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v20
-; GCN-HSA-NEXT:    v_and_b32_e32 v36, 0xffff, v21
-; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[34:37]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v25
+; GCN-HSA-NEXT:    v_and_b32_e32 v32, 0xffff, v24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xc0
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xd0
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xa0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v23
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v23
-; GCN-HSA-NEXT:    v_and_b32_e32 v32, 0xffff, v22
 ; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[32:35]
+; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x80
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v27
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v27
+; GCN-HSA-NEXT:    v_and_b32_e32 v32, 0xffff, v26
+; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[32:35]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s11
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[24:27]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v22, 0xffff, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v20, 0xffff, v16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v19
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
-; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v19
-; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s5
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s4
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v14
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[15:18]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v8
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[16:19]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[11:14]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v28
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v28
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v29
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v31
+; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v30
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[15:18]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v30
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v29
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v28
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v27
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v30
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v29
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v28
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v30, 16, v25
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v28, 16, v24
-; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v27
-; GCN-HSA-NEXT:    v_and_b32_e32 v29, 0xffff, v25
-; GCN-HSA-NEXT:    v_and_b32_e32 v27, 0xffff, v24
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v2
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[27:30]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v26
-; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v21
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v23
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v22
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v18
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v16
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v31
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[15:18]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
@@ -4383,189 +4384,191 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    s_add_u32 s8, s2, 64
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[24:25]
-; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s4
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[8:9]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 16, v21
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 16, v20
-; GCN-HSA-NEXT:    v_bfe_i32 v34, v21, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v32, v20, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 16, v29
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 16, v28
+; GCN-HSA-NEXT:    v_bfe_i32 v34, v29, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v32, v28, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
+; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v34, 16, v31
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v32, 16, v30
+; GCN-HSA-NEXT:    v_bfe_i32 v33, v31, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v31, v30, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 16, v23
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 16, v22
-; GCN-HSA-NEXT:    v_bfe_i32 v34, v23, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v32, v22, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[32:35]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[31:34]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v21
+; GCN-HSA-NEXT:    v_bfe_i32 v30, v21, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v28, v20, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[35:36], v[28:31]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s4
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v17
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v16
-; GCN-HSA-NEXT:    v_bfe_i32 v22, v17, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v20, v16, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[20:23]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
-; GCN-HSA-NEXT:    v_bfe_i32 v21, v19, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v19, v18, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v23
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v22
+; GCN-HSA-NEXT:    v_bfe_i32 v30, v23, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v28, v22, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[19:22]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v13
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v12
-; GCN-HSA-NEXT:    v_bfe_i32 v18, v13, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v16, v12, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v15
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v14
-; GCN-HSA-NEXT:    v_bfe_i32 v22, v15, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v20, v14, 0, 16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v13
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v12
+; GCN-HSA-NEXT:    v_bfe_i32 v22, v13, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v20, v12, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v38, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[16:19]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[20:23]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v9
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v8
-; GCN-HSA-NEXT:    v_bfe_i32 v17, v9, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v15, v8, 0, 16
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[15:18]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[11:14]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v0
-; GCN-HSA-NEXT:    v_bfe_i32 v17, v1, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v15, v0, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v14
+; GCN-HSA-NEXT:    v_bfe_i32 v30, v15, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v28, v14, 0, 16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v4
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v5, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v4, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v6
+; GCN-HSA-NEXT:    v_bfe_i32 v22, v7, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v20, v6, 0, 16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
-; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v4
-; GCN-HSA-NEXT:    v_bfe_i32 v13, v5, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v11, v4, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 16, v3
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 16, v2
-; GCN-HSA-NEXT:    v_bfe_i32 v5, v3, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v2
+; GCN-HSA-NEXT:    v_bfe_i32 v22, v3, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v20, v2, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v8
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v9, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v8, 0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    v_bfe_i32 v19, v28, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v27
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v26
-; GCN-HSA-NEXT:    v_bfe_i32 v2, v27, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v0, v26, 0, 16
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v28, 16, v25
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v26, 16, v24
-; GCN-HSA-NEXT:    v_bfe_i32 v27, v25, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v25, v24, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[25:28]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v24
+; GCN-HSA-NEXT:    v_bfe_i32 v12, v24, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v24, 16, v11
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v10
+; GCN-HSA-NEXT:    v_bfe_i32 v23, v11, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v21, v10, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v29
-; GCN-HSA-NEXT:    v_bfe_i32 v21, v29, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[21:24]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
+; GCN-HSA-NEXT:    v_bfe_i32 v2, v19, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v0, v18, 0, 16
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v17
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v16
+; GCN-HSA-NEXT:    v_bfe_i32 v19, v17, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v17, v16, 0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v31
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v30
-; GCN-HSA-NEXT:    v_bfe_i32 v17, v31, 0, 16
-; GCN-HSA-NEXT:    v_bfe_i32 v15, v30, 0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v25
+; GCN-HSA-NEXT:    v_bfe_i32 v14, v25, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v27
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v26
+; GCN-HSA-NEXT:    v_bfe_i32 v6, v27, 0, 16
+; GCN-HSA-NEXT:    v_bfe_i32 v4, v26, 0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32:
@@ -6526,16 +6529,16 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -6543,65 +6546,65 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
 ; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v5
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v3
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[7:10]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[8:11]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[7:8], v[14:17]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v15
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[2:5]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[10:13]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[6:9]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -7358,26 +7361,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[5:8], v[4:5]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[9:12], v[9:10]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s2
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[13:16], v[13:14]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
@@ -7392,124 +7396,123 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
 ; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v4
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, v4
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s10
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s10
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v9
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s12
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v8
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s12
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v6
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v10
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v14
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[17:20]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s4
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v14
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v23, 0xffff, v14
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v28, 16, v13
+; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[23:26]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[26:29]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v19, 0xffff, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v24, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v22, 0xffff, v7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[19:22]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[22:25]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[18:21]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[5:8]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[14:17]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index ea1e784fe58e2e8..dcaf7664d5b5aa0 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -45,14 +45,14 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX11:       ; %bb.0:
@@ -60,18 +60,6 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x7ff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -85,14 +73,14 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX11:       ; %bb.0:
@@ -100,18 +88,6 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0xfff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -127,14 +103,14 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
@@ -148,36 +124,17 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x1fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -187,610 +144,320 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 }
 
 define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 -2048
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
+define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 -4096
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
+define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 -8192
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
+define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i8, ptr %p, i64 4095
+  %load = load i8, ptr %gep, align 4
+  ret i8 %load
+}
+
+define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xf800
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xf800
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xf800
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 -2048
+  %gep = getelementptr i8, ptr %p, i64 8191
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max:
+define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
+; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xf000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 -4096
-  %load = load i8, ptr %gep, align 4
-  ret i8 %load
-}
-
-define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xe000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 -8192
+  %gep = getelementptr i8, ptr %p, i64 16383
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
-; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
+define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
+; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0xfff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 4095
-  %load = load i8, ptr %gep, align 4
-  ret i8 %load
-}
-
-define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x1fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 8191
-  %load = load i8, ptr %gep, align 4
-  ret i8 %load
-}
-
-define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x3fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x3fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x3fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, ptr %p, i64 16383
-  %load = load i8, ptr %gep, align 4
-  ret i8 %load
-}
-
-define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xf000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xe000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xc000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xc000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xc000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load i8, ptr %gep, align 4
   ret i8 %load
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index c9c6c0912bda902..137c83a0fd80c96 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -72,11 +72,8 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0xfff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -105,11 +102,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -117,11 +111,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -129,12 +120,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x1fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -204,18 +191,14 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX11:       ; %bb.0:
@@ -223,87 +206,38 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xe000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -320,11 +254,8 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0xfff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -353,11 +284,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -365,11 +293,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x1fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -377,12 +302,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x1fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -422,11 +343,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], 0x3fff
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3fff, v0
+; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -434,11 +352,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b64 s[4:5], 0x3fff
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -446,12 +361,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b64 s[0:1], 0x3fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -495,18 +406,14 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xf000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX11:       ; %bb.0:
@@ -514,159 +421,70 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xe000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xe000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xc000
-; GFX9-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xc000
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, -1
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xc000
-; GFX11-GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
-; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index ca79772dbed74c0..536b2d054272ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -3065,33 +3065,31 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX7-LABEL: v_mul_284_add_82_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x52
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7-NEXT:    s_movk_i32 s6, 0x11c
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
-; GFX7-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    s_movk_i32 s4, 0x11c
+; GFX7-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x52
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_284_add_82_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x52
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    s_movk_i32 s6, 0x11c
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
-; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x11c
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x52
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_284_add_82_i64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x52
-; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    s_movk_i32 s6, 0x11c
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2]
@@ -3101,8 +3099,8 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x52
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x11c
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
@@ -3113,9 +3111,9 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX10-LABEL: v_mul_284_add_82_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x52
+; GFX10-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, 0x11c, v0, s[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, s4, 0x52
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, 0x11c, v2, v[1:2]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 284
@@ -3139,33 +3137,31 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX7-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x7da667
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7-NEXT:    s_mov_b32 s6, 0x37b4a145
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
-; GFX7-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x37b4a145
+; GFX7-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7da667
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7da667
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    s_mov_b32 s6, 0x37b4a145
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
-; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x37b4a145
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7da667
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7da667
-; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    s_mov_b32 s6, 0x37b4a145
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2]
@@ -3175,8 +3171,8 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x7da667
-; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    s_mov_b32 s6, 0x37b4a145
+; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
@@ -3187,9 +3183,9 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX10-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x7da667
+; GFX10-NEXT:    s_mov_b32 s4, 0x37b4a145
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, 0x37b4a145, v0, s[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, s4, 0x7da667
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, 0x37b4a145, v2, v[1:2]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 934584645
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 0654d555766456e..3dc565ceed0d0ba 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -59,10 +59,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
-; SI-GISEL-NEXT:    s_brev_b32 s3, 8
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -145,10 +143,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0
-; VI-GISEL-NEXT:    s_brev_b32 s3, 8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -243,11 +239,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; SI-GISEL-NEXT:    s_brev_b32 s3, 8
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
@@ -329,11 +323,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    s_brev_b32 s3, 8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
@@ -428,10 +420,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
-; SI-GISEL-NEXT:    s_brev_b32 s3, 8
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -514,10 +504,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0
-; VI-GISEL-NEXT:    s_brev_b32 s3, 8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -612,11 +600,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; SI-GISEL-NEXT:    s_brev_b32 s3, 8
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
@@ -698,11 +684,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    s_brev_b32 s3, 8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
@@ -797,11 +781,11 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -879,11 +863,11 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -966,11 +950,11 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -1048,11 +1032,11 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1136,11 +1120,11 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -1218,11 +1202,11 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1305,11 +1289,11 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -1387,11 +1371,11 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1474,11 +1458,11 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-LABEL: v_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -1556,11 +1540,11 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1680,24 +1664,26 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1825,12 +1811,14 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1855,15 +1843,15 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1977,24 +1965,26 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2122,12 +2112,14 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2152,15 +2144,15 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
@@ -2242,15 +2234,17 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -2258,7 +2252,7 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2358,12 +2352,14 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2388,15 +2384,15 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5]
@@ -2511,15 +2507,17 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -2527,7 +2525,7 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2657,12 +2655,14 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2687,15 +2687,15 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -2772,11 +2772,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -2854,11 +2854,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -2943,11 +2943,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3025,11 +3025,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3104,11 +3104,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3174,11 +3174,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3249,11 +3249,11 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3319,11 +3319,11 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3395,11 +3395,11 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3467,11 +3467,11 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3543,11 +3543,11 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3613,11 +3613,11 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3688,11 +3688,11 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3758,11 +3758,11 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3833,11 +3833,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -3903,11 +3903,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3979,11 +3979,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -4051,11 +4051,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4135,11 +4135,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -4217,11 +4217,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4325,13 +4325,13 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -4339,28 +4339,30 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[10:11], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
@@ -4445,12 +4447,14 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -4543,10 +4547,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ;
 ; SI-GISEL-LABEL: s_rsq_f64_unsafe:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0
-; SI-GISEL-NEXT:    s_brev_b32 s3, 8
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -4617,10 +4619,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ;
 ; VI-GISEL-LABEL: s_rsq_f64_unsafe:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0
-; VI-GISEL-NEXT:    s_brev_b32 s3, 8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -4703,11 +4703,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-LABEL: v_rsq_f64_unsafe:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -4773,11 +4773,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-LABEL: v_rsq_f64_unsafe:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -5109,14 +5109,15 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5128,8 +5129,7 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5190,11 +5190,11 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5276,14 +5276,15 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5295,8 +5296,7 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5357,11 +5357,11 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5443,14 +5443,15 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0
-; SI-GISEL-NEXT:    s_brev_b32 s5, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5462,8 +5463,7 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5524,11 +5524,11 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5568,16 +5568,18 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    s_brev_b32 s7, 8
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
-; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40700000
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0x40700000
 ; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
 ; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -5600,7 +5602,7 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], s[6:7], v[0:1], s[6:7]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v7
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s8, v7
 ; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
@@ -5611,19 +5613,20 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s6, 0
-; SI-GISEL-NEXT:    s_brev_b32 s7, 8
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0x40700000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -5655,9 +5658,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40700000
 ; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
@@ -5695,12 +5699,13 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
-; VI-GISEL-NEXT:    s_brev_b32 s5, 8
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 7027521d7e2dcea..714b2af1698fe1e 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -171,10 +171,10 @@ entry:
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
 ; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
 ; CI-NOHSA-NOT: v_add
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
 ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
 
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@@ -201,11 +201,11 @@ entry:
 
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
 
-; SI: s_mov_b32 {{s[0-9]+}}, 0x13480
+; SI-DAG: s_mov_b64 s[{{[0-9:]+}}], 0x13480
 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
+; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
 ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 4f2fd3f50494c94..9cb6842ae0a1827 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1592,31 +1592,30 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[5:6]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[8:9], v4
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v9
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index be0aa394dd99dc0..9cd13b3d4515099 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -1284,18 +1284,15 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr
 ; VI-LABEL: v_shl_i64_32_bit_constant:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b64 s[0:1], 0x12d687
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_lshl_b64 s[4:5], 0x12d687, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_shl_i64_32_bit_constant:
@@ -1875,30 +1872,28 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
 define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
 ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
-; SI-NEXT:    s_mov_b64 s[0:1], 0x40800000
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
-; VI-NEXT:    s_mov_b64 s[0:1], 0x40800000
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
@@ -1931,10 +1926,10 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s0, -4.0
-; SI-NEXT:    s_mov_b32 s1, s6
+; SI-NEXT:    s_mov_b32 s1, -1
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1946,10 +1941,10 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_mov_b32 s0, -4.0
-; VI-NEXT:    s_mov_b32 s1, s6
+; VI-NEXT:    s_mov_b32 s1, -1
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 08db1e7fee259d6..6a4c2c9fcf52852 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -16,2668 +16,2662 @@
 define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-LABEL: test:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s44, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s45, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT:    s_mov_b32 s46, -1
-; GFX6-NEXT:    s_mov_b32 s47, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s44, s44, s3
+; GFX6-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s42, -1
+; GFX6-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s40, s40, s3
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
 ; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 13, v0
-; GFX6-NEXT:    s_mov_b32 s18, 0
-; GFX6-NEXT:    s_mov_b32 s19, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_movk_i32 s4, 0x80
-; GFX6-NEXT:    s_mov_b32 s5, s18
-; GFX6-NEXT:    s_mov_b64 s[6:7], s[18:19]
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:3968
-; GFX6-NEXT:    s_addc_u32 s45, s45, 0
-; GFX6-NEXT:    s_movk_i32 s8, 0x100
-; GFX6-NEXT:    s_mov_b32 s9, s18
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s12, 0x180
-; GFX6-NEXT:    s_mov_b32 s13, s18
-; GFX6-NEXT:    s_mov_b64 s[14:15], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s20, 0x200
-; GFX6-NEXT:    s_mov_b32 s21, s18
-; GFX6-NEXT:    s_mov_b64 s[22:23], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s24, 0x280
-; GFX6-NEXT:    s_mov_b32 s25, s18
-; GFX6-NEXT:    s_mov_b64 s[26:27], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s28, 0x300
-; GFX6-NEXT:    s_mov_b32 s29, s18
-; GFX6-NEXT:    s_mov_b64 s[30:31], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s36, 0x380
-; GFX6-NEXT:    s_mov_b32 s37, s18
-; GFX6-NEXT:    s_mov_b64 s[38:39], s[18:19]
-; GFX6-NEXT:    s_movk_i32 s40, 0x400
-; GFX6-NEXT:    s_mov_b32 s41, s18
-; GFX6-NEXT:    s_mov_b64 s[42:43], s[18:19]
-; GFX6-NEXT:    s_mov_b64 s[16:17], s[2:3]
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, s2, v5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v0, vcc
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
+; GFX6-NEXT:    s_addc_u32 s41, s41, 0
 ; GFX6-NEXT:    s_mov_b32 s2, 0x3fd00
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0x100
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[12:13], 0x180
+; GFX6-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[16:17], 0x200
+; GFX6-NEXT:    s_mov_b64 s[18:19], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[20:21], 0x280
+; GFX6-NEXT:    s_mov_b64 s[22:23], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[24:25], 0x300
+; GFX6-NEXT:    s_mov_b64 s[26:27], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[28:29], 0x380
+; GFX6-NEXT:    s_mov_b64 s[30:31], s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[36:37], 0x400
+; GFX6-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GFX6-NEXT:    s_mov_b32 s33, 0x4f900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1268 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1272 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1276 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1280 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:16 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1300 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:20 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1304 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1308 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1312 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:24 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:28 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:32 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:32
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1332 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1336 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1340 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1344 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:40 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:44 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:48 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:48
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1364 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:52 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1368 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1372 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1376 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:56 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:60 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:64 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1396 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:68 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1400 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1404 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1408 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:72 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:76 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:80 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1428 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:84 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1432 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1436 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1440 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:88 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:92 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:96 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1460 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:100 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1464 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1468 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1472 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:104 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:108 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:112 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1492 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:116 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1496 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1500 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1504 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:120 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:124 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:128 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1556 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:132 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1560 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1564 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1568 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:136 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:140 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:144 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1588 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:148 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1592 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1596 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1600 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:152 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:156 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:160 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1620 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:164 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1624 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1628 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1632 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:168 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:172 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:176 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1652 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:180 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1656 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1660 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1664 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:184 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:188 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:192 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1684 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:196 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1688 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1692 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1696 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:200 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:204 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:208 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1716 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:212 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1720 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1724 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1728 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:216 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:220 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:224 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1748 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:228 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1752 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1756 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1760 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:232 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:236 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:240 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1780 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:244 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1784 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1788 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1792 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:248 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:252 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:256 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:256
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1860 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:260 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1864 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1868 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1872 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:264 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:268 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:272 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:272
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1892 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:276 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1896 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1900 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1904 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:280 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:284 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:288 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:288
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1924 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:292 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1928 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1932 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1936 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:296 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:300 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:304 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:304
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1956 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:308 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1960 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1964 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:1968 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:312 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:316 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:320 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:320
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:1988 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:324 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:1992 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:1996 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2000 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:328 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:332 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:336 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:336
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2020 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:340 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2024 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2028 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2032 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:344 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:348 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:352 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:352
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2052 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:356 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2056 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2060 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2064 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:360 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:364 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:368 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[12:15], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:368
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2084 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:372 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2088 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2092 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2096 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:376 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:380 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:384 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:384
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2148 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:388 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2152 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2156 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2160 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:392 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:396 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:400 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2180 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:404 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2184 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2188 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2192 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:408 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:412 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:416 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:416
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2212 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:420 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2216 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2220 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2224 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:424 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:428 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:432 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:432
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2244 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:436 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2248 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2252 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2256 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:440 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:444 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:448 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:448
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2276 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:452 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2280 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2284 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2288 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:456 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:460 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:464 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:464
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2308 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:468 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2312 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2316 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2320 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:472 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:476 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:480 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:480
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2340 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:484 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2344 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2348 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2352 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:488 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:492 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:496 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[20:23], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:496
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2372 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:500 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2376 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2380 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2384 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:504 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:508 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:512 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:512
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2452 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:516 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2456 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2460 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2464 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:520 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:524 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:528 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:528
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2484 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:532 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2488 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2492 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2496 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:536 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:540 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:544 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:544
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2516 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:548 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2520 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2524 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2528 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:552 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:556 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:560 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:560
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2548 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:564 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2552 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2556 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2560 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:568 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:572 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:576 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:576
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2580 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:580 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2584 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2588 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2592 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:584 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:588 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:592 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:592
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2612 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:596 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2616 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2620 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2624 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:600 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:604 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:608 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:608
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2644 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:612 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2648 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2652 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2656 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:616 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:620 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:624 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[24:27], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:624
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2676 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:628 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2680 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2684 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2688 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:632 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:636 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:640 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:640
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2740 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:644 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2744 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2748 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2752 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:648 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:652 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:656 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:656
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2772 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:660 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2776 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2780 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2784 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:664 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:668 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:672 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:672
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2804 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:676 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2808 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2812 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2816 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:680 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:684 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:688 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:688
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2836 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:692 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2840 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2844 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2848 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:696 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:700 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:704 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:704
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2868 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:708 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2872 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2876 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2880 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:712 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:716 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:720 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:720
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2900 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:724 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2904 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2908 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2912 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:728 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:732 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:736 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:736
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2932 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:740 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2936 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2940 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2944 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:744 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:748 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:752 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[28:31], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:752
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:2964 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:756 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:2968 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:2972 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:2976 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:760 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:764 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:768 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:768
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3044 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:772 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3048 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3052 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3056 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:776 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:780 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:784 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:784
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3076 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:788 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3080 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3084 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3088 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:792 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:796 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:800 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3108 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:804 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3112 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3116 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3120 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:808 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:812 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:816 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:816
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3140 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:820 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3144 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3148 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3152 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:824 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:828 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:832 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:832
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3172 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:836 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3176 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3180 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3184 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:840 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:844 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:848 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:848
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3204 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:852 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3208 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3212 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3216 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:856 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:860 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:864 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:864
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3236 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:868 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3240 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3244 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3248 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:872 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:876 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:880 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[36:39], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:880
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3268 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:884 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3272 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3276 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3280 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:888 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:892 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:896 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:896
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3332 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:900 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3336 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3340 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3344 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:904 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:908 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:912 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:912
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3364 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:916 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3368 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3372 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3376 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:920 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:924 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:928 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:928
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3396 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:932 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3400 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3404 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3408 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:936 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:940 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:944 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:944
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3428 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:948 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3432 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3436 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3440 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:952 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:956 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:960 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:960
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3460 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:964 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3464 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3468 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3472 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:968 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:972 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:976 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:976
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3492 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:980 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3496 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3500 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3504 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:984 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:988 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:992 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[40:43], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:992
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[44:47], 0 offset:3524 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:996 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v8, off, s[44:47], 0 offset:3528 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[44:47], 0 offset:3532 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[44:47], 0 offset:3536 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[40:43], 0 addr64 offset:4080
-; GFX6-NEXT:    s_waitcnt expcnt(3)
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, s0, v5
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1000 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1004 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1008 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1008
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3556 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1012 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3560 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3564 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3568 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1016 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1020 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1024 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1024
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1028 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:16 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1032 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1036 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1040 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:16
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:20 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1044 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:24 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:28 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:32 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1048 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1052 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1056 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:32
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1056
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:36 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1060 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:40 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:44 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:48 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1064 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1068 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1072 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1072
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:52 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1076 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:56 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:60 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:64 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1080 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1084 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1088 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:64
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1088
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:68 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1092 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:72 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:76 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:80 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1096 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1100 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1104 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1104
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:84 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1108 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:88 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:92 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:96 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1112 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1116 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1120 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1120
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:100 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1124 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:104 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:108 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:112 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1128 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1132 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1136 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1136
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:116 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1140 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:120 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:124 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:128 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1144 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1148 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1152 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1152
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:132 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1156 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:136 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:140 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:144 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1160 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1164 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1168 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1168
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:148 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1172 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:152 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:156 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:160 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1176 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1180 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1184 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1184
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:164 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1188 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:168 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:172 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:176 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1192 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1196 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1200 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:180 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1204 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:184 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:188 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:192 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1208 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1212 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1216 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:192
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1216
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:196 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1220 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:200 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:204 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:208 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1224 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1228 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1232 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1232
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:212 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1236 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:216 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:220 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:224 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1240 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1244 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1248 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1248
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:228 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1252 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:232 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:236 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:240 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1256 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1260 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1264 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1264
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:244 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1268 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:248 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:252 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:256 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1272 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1276 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1280 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:256
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1280
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:260 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1284 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:264 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:268 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:272 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1288 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1292 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1296 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:272
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1296
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:276 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1300 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:280 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:284 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:288 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1304 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1308 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1312 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:288
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1312
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:292 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1316 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:296 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:300 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:304 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1320 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1324 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1328 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:304
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1328
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:308 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1332 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:312 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:316 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:320 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1336 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1340 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1344 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:320
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1344
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:324 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1348 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:328 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:332 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:336 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1352 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1356 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1360 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:336
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1360
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:340 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1364 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:344 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:348 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:352 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1368 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1372 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1376 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:352
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1376
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:356 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1380 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:360 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:364 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:368 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1384 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1388 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1392 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:368
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1392
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:372 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1396 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:376 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:380 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:384 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1400 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1404 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1408 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:384
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1408
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:388 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1412 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:392 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:396 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:400 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1416 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1420 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1424 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:400
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1424
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:404 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1428 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:408 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:412 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:416 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1432 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1436 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1440 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:416
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1440
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:420 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1444 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:424 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:428 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:432 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1448 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1452 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1456 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:432
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1456
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:436 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1460 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:440 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:444 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:448 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1464 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1468 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1472 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:448
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1472
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:452 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1476 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:456 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:460 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:464 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1480 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1484 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1488 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:464
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1488
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:468 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1492 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:472 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:476 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:480 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1496 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1500 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1504 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:480
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1504
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:484 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1508 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:488 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:492 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:496 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1512 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1516 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1520 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:496
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1520
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:500 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1524 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:504 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:508 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:512 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1528 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1532 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1536 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:512
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1536
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:516 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1540 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:520 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:524 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:528 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1544 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1548 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1552 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:528
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1552
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:532 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1556 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:536 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:540 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:544 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1560 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1564 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1568 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:544
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1568
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:548 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1572 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:552 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:556 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:560 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1576 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1580 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1584 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:560
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1584
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:564 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1588 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:568 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:572 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:576 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1592 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1596 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1600 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:576
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:580 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1604 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:584 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:588 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:592 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1608 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1612 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1616 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:592
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1616
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:596 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1620 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:600 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:604 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:608 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1624 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1628 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1632 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:608
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1632
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:612 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1636 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:616 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:620 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:624 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1640 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1644 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1648 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:624
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1648
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:628 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1652 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:632 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:636 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:640 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1656 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1660 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1664 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:640
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1664
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:644 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1668 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:648 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:652 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:656 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1672 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1676 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1680 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:656
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1680
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:660 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1684 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:664 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:668 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:672 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1688 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1692 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1696 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:672
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1696
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:676 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1700 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:680 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:684 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:688 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1704 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1708 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1712 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:688
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1712
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:692 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1716 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:696 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:700 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:704 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1720 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1724 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1728 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:704
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1728
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:708 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1732 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:712 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:716 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:720 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1736 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1740 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1744 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:720
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1744
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:724 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1748 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:728 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:732 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:736 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1752 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1756 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1760 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:736
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1760
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:740 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1764 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:744 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:748 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:752 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1768 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1772 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1776 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:752
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1776
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:756 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1780 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:760 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:764 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:768 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1784 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1788 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1792 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:768
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1792
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:772 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1796 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:776 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:780 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:784 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1800 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1804 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1808 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:784
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1808
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:788 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1812 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:792 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:796 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:800 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1816 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1820 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1824 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:800
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1824
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:804 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1828 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:808 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:812 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:816 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1832 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1836 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1840 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:816
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1840
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:820 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1844 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:824 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:828 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:832 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1848 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1852 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1856 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:832
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1856
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:836 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1860 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:840 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:844 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:848 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1864 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1868 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1872 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:848
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1872
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:852 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1876 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:856 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:860 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:864 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1880 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1884 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1888 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:864
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1888
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:868 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1892 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:872 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:876 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:880 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1896 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1900 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1904 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:880
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1904
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:884 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1908 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:888 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:892 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:896 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1912 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1916 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1920 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:896
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1920
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:900 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1924 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:904 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:908 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:912 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1928 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1932 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1936 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:912
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1936
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:916 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1940 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:920 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:924 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:928 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1944 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1948 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1952 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:928
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1952
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:932 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1956 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:936 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:940 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:944 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1960 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1964 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1968 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:944
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1968
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:948 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1972 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:952 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:956 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:960 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1976 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1980 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:1984 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:960
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:1984
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:964 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:1988 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:968 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:972 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:976 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:1992 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:1996 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2000 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:976
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:980 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2004 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:984 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:988 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:992 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2008 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2012 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2016 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:992
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2016
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:996 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2020 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1000 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1004 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1008 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2024 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2028 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2032 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1008
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2032
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1012 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2036 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1016 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1020 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1024 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2040 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2044 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2048 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1024
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2048
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1028 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2052 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1032 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1036 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1040 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2056 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2060 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2064 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1040
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2064
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1044 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2068 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1048 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1052 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1056 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2072 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2076 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2080 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1056
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2080
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1060 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2084 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1064 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1068 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1072 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2088 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2092 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2096 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1072
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2096
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1076 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2100 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1080 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1084 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1088 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2104 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2108 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2112 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1088
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2112
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1092 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2116 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1096 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1100 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1104 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2120 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2124 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2128 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1104
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2128
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1108 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2132 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1112 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1116 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1120 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2136 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2140 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2144 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1120
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2144
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1124 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2148 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1128 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1132 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1136 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2152 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2156 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2160 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1136
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2160
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1140 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2164 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1144 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1148 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1152 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2168 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2172 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2176 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1152
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2176
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1156 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2180 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1160 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1164 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1168 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2184 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2188 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2192 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1168
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2192
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1172 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2196 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1176 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1180 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1184 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2200 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2204 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2208 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1184
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2208
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1188 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2212 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1192 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1196 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1200 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2216 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2220 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2224 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1200
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2224
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1204 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2228 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1208 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1212 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1216 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2232 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2236 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2240 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1216
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2240
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1220 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2244 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1224 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1228 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1232 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2248 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2252 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2256 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1232
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2256
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1236 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2260 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1240 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1244 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1248 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2264 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2268 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2272 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1248
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2272
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1252 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2276 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1256 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1260 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1264 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2280 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2284 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2288 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1264
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2288
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2292 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2296 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2300 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2304 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1280
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2304
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2308 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1320 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2312 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2316 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2320 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1296
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2320
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2324 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1352 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2328 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2332 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2336 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1312
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2336
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2340 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2344 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2348 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2352 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1328
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2352
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2356 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2360 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2364 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2368 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1344
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2368
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2372 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2376 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2380 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2384 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1360
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2384
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2388 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1480 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2392 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2396 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2400 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1376
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2404 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2408 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2412 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2416 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1392
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2416
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2420 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1528 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2424 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2428 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2432 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1408
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2432
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2436 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2440 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2444 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2448 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1424
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2448
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2452 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2456 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2460 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2464 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1440
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2464
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2468 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2472 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2476 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2480 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1456
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2480
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2484 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1640 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2488 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2492 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2496 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1472
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2496
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2500 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1672 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2504 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2508 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2512 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1488
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2512
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2516 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1704 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2520 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2524 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2528 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1504
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2528
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2532 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1736 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2536 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2540 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2544 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1520
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2544
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2548 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1768 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2552 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2556 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2560 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1536
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2560
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2564 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1800 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2568 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2572 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2576 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1552
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2576
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2580 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2584 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2588 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2592 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1568
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2592
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2596 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1832 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2600 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2604 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2608 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1584
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2608
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2612 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1852 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2616 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2620 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2624 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1600
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2624
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2628 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1880 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2632 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2636 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2640 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1616
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2640
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2644 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1912 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1920 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2648 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2652 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2656 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1632
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2656
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2660 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2664 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2668 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2672 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1648
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2672
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2676 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2680 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2684 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2688 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1664
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2688
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2692 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2696 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2700 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2704 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1680
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2704
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2708 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2712 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2716 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2720 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1696
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2720
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2724 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2728 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2732 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2736 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1712
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2736
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2100 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2740 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2744 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2748 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2752 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1728
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2752
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2756 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2120 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2760 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2764 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2768 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1744
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2768
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2772 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2776 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2780 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2784 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1760
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2784
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2788 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2792 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2796 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2800 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1776
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2804 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2200 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2808 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2812 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2816 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1792
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2816
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2820 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2232 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2824 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2828 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2832 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1808
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2832
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2836 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2264 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2840 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2844 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2848 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1824
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2848
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2852 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2296 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2856 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2860 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2864 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1840
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2864
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2324 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2868 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2328 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2872 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2876 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2880 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1856
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2880
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2884 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2360 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2888 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2892 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2896 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1872
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2896
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2900 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2392 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2904 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2908 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2912 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1888
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2912
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2916 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2920 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2924 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2928 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1904
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2928
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2932 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2424 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2936 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2940 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2944 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1920
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2944
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2948 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2952 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2956 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2960 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1936
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2960
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2964 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2472 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2968 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2972 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2976 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1952
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2976
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2980 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:2984 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:2988 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:2992 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:2992
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:2996 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3000 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3004 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3008 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:1984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3008
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3012 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2568 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3016 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3020 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3024 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3024
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3028 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2608 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3032 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3036 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3040 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3044 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3048 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3052 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3056 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3056
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3060 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3064 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3068 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3072 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3072
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3076 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3080 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3084 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3088 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3088
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3092 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2720 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3096 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3100 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3104 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3104
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3108 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2732 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3112 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3116 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3120 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2096
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3120
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3124 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2760 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3128 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3132 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3136 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2112
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3136
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3140 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2792 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3144 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3148 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3152 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2128
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3152
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3156 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2824 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3160 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3164 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3168 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2144
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3168
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3172 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2856 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3176 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3180 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3184 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2160
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3184
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3188 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2888 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3192 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3196 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3200 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2176
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3204 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2920 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3208 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3212 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3216 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2192
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3216
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3220 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2952 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3224 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3228 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3232 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2208
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3232
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3236 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:2984 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3240 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3244 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3248 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2224
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3248
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3252 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3256 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3260 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3264 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2240
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3264
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3012 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3268 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3016 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3020 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3024 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3272 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3276 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3280 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2256
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3280
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3028 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3284 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3032 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3036 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3040 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3288 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3292 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3296 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2272
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3296
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3300 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3072 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3304 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3308 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3312 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2288
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3312
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3316 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3320 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3324 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3328 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2304
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3328
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3332 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3336 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3340 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3344 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2320
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3344
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3348 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3352 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3356 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3360 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2336
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3360
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3364 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3368 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3372 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3376 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2352
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3376
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3380 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3384 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3388 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3392 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2368
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3392
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3396 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3400 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3404 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3408 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2384
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3408
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3412 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3416 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3420 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3424 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2400
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3424
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3300 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3428 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3304 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3308 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3312 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3432 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3436 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3440 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2416
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3440
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3316 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3444 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3320 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3324 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3328 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3448 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3452 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3456 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2432
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3456
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3348 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3460 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3352 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3356 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3360 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3464 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3468 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3472 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2448
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3472
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3380 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3476 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3384 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3388 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3392 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3480 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3484 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3488 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2464
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3488
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3412 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3492 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3416 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3420 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3424 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3496 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3500 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3504 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2480
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3504
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3444 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3508 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3448 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3452 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3456 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3512 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3516 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3520 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2496
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3520
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3476 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3524 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3480 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3484 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3488 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3528 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3532 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3536 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2512
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3536
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3508 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3540 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3512 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3516 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3520 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3544 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3548 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3552 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2528
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3552
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3540 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3556 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3544 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3548 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3552 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3560 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3564 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3568 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2544
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3568
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3572 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3572 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3576 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3580 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3584 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3576 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3580 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3584 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2560
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3584
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3588 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3588 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3592 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3596 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3600 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3592 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3596 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3600 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2576
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3604 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3604 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3608 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3612 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3616 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3608 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3612 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3616 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2592
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3616
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3620 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3620 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3624 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3628 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3632 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3624 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3628 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3632 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2608
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3632
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3636 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3636 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3640 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3644 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3648 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3640 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3644 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3648 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2624
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3648
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3652 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3652 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3656 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3660 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3664 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3656 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3660 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3664 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2640
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3664
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3668 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3668 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3672 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3676 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3680 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3672 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3676 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3680 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2656
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3680
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3684 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3684 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3688 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3692 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3696 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3688 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3692 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3696 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2672
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3696
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3700 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3700 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3704 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3708 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3712 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3704 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3708 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3712 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2688
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3712
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3716 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3716 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3720 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3724 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3728 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3720 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3724 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3728 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2704
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3728
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3732 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3732 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3736 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3740 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3744 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3736 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3740 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3744 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2720
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3744
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3748 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3748 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3752 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3756 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3760 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3752 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3756 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3760 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2736
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3760
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3764 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3764 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3768 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3772 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3776 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3768 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3772 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3776 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2752
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3776
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3780 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3780 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3784 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3788 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3792 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3784 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3788 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3792 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2768
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3792
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3796 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3796 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3800 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3804 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3808 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3800 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3804 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3808 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2784
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3808
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3812 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3812 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3816 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3820 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3824 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3816 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3820 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3824 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2800
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3824
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3828 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3828 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3832 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3836 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3840 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3832 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3836 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3840 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2816
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3840
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3844 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3844 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3848 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3852 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3856 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3848 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3852 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3856 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2832
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3856
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3860 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3860 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3864 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3868 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3872 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3864 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3868 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3872 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2848
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3872
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3876 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3876 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3880 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3884 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3888 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3880 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3884 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3888 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2864
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3888
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3892 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3892 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3896 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3900 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3904 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3896 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3900 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3904 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2880
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3904
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3908 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3908 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3912 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3916 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3920 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3912 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3916 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3920 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2896
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3920
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3924 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3924 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3928 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3932 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3936 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3928 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3932 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3936 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2912
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3936
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3940 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3940 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3944 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3948 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3952 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3944 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3948 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3952 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2928
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3952
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3956 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3956 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3960 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3964 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3968 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3960 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3964 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3968 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2944
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3972 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3972 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3976 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3980 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:3984 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3976 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3980 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:3984 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2960
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:3988 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:3988 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:3992 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:3996 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4000 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:3992 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:3996 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4000 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2976
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4004 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4004 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:4008 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:4012 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4016 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4008 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4012 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4016 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:2992
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4020 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4020 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:4024 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:4028 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4032 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4024 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4028 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4032 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3008
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4036 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4036 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:4040 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:4044 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4048 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4040 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4044 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4048 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3024
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4052 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4052 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:4056 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:4060 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4064 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4056 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4060 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4064 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3040
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], 0 offset:4068 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0 offset:4068 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], 0 offset:4072 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], 0 offset:4076 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], 0 offset:4080 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0 offset:4072 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], 0 offset:4076 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], 0 offset:4080 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3056
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:4080
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0x80
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3072
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x40100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3088
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x40500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3104
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x40900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3120
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x40d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3136
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x41100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3152
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x41500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3168
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x41900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3184
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x41d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3200
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x42100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3216
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x42500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3232
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x42900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3248
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x42d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3264
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x43100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3280
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x43500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3296
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x43900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3312
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x43d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3328
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x44100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3344
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x44500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3360
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x44900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3376
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x44d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3392
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x45100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3408
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x45500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3424
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x45900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3440
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x45d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3456
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x46100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3472
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x46500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3488
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x46900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3504
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x46d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3520
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x47100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3536
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x47500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3552
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x47900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3568
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x47d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3584
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x48100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3600
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x48500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3616
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x48900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3632
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x48d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3648
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x49100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3664
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x49500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3680
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x49900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3696
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x49d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3712
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4a100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3728
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4a500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3744
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4a900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3760
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4ad00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3776
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4b100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3792
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4b500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3808
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4b900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3824
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4bd00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3840
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4c100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3856
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4c500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3872
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4c900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3888
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4cd00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3904
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4d100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3920
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4d500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3936
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4d900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3952
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4dd00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4e100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4e500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4e900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4ed00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4f100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4f500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_mov_b32 s2, 0x4f900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[44:47], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[44:47], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[44:47], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[44:47], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4080
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, s0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v4, s1
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -2690,2304 +2684,2303 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3556 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3560 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3564 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3568 ; 4-byte Folded Reload
-; GFX6-NEXT:    v_mov_b32_e32 v4, s1
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3524 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3528 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3532 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3536 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4f500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3492 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3496 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3500 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3504 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4f100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3460 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3464 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3468 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3472 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4ed00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3428 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3432 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3436 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3440 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4e900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3396 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3400 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3404 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3408 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4e500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3364 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3368 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3372 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3376 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4e100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3332 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3336 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3340 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3344 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4dd00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[40:43], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3268 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3272 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3276 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3280 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4d900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3236 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3240 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3244 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3248 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4d500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3204 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3208 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3212 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3216 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4d100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3172 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3176 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3180 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3184 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4cd00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3140 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3144 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3148 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3152 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4c900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3108 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3112 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3116 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3120 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4c500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3076 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3080 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3084 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3088 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s33, 0x4c100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:3044 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:3048 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:3052 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:3056 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s33 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s33 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s33 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s33 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[28:31], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s28, 0x4bd00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2964 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2968 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2972 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2976 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4b900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2932 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2936 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2940 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2944 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4b500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2900 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2904 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2908 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2912 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4b100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2868 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2872 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2876 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2880 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4ad00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2836 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2840 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2844 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2848 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4a900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2804 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2808 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2812 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2816 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4a500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2772 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2776 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2780 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2784 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s28, 0x4a100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2740 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2744 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2748 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2752 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s28 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s28 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s28 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[28:31], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[24:27], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s24, 0x49d00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2676 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2680 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2684 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2688 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x49900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2644 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2648 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2652 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2656 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x49500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2612 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2616 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2620 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2624 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x49100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2580 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2584 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2588 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2592 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x48d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2548 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2552 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2556 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2560 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x48900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2516 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2520 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2524 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2528 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x48500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2484 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2488 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2492 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2496 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s24, 0x48100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2452 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2456 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2460 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2464 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s24 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s24 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s24 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[24:27], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[20:23], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s20, 0x47d00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2372 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2376 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2380 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2384 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x47900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2340 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2344 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2348 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2352 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x47500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2308 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2312 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2316 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2320 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x47100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2276 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2280 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2284 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2288 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x46d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2244 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2248 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2252 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2256 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x46900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2212 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2216 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2220 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2224 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x46500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2180 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2184 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2188 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2192 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s20, 0x46100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2148 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2152 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2156 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2160 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s20 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s20 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s20 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[20:23], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[16:19], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s16, 0x45d00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2084 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2088 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2092 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2096 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x45900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2052 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2056 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2060 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2064 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x45500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:2020 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:2024 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:2028 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2032 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x45100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1988 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1992 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1996 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:2000 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x44d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1956 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1960 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1964 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1968 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x44900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1924 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1928 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1932 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1936 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x44500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1892 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1896 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1900 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1904 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s16, 0x44100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1860 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1864 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1868 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1872 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s16 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s16 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s16 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[12:15], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[12:15], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s12, 0x43d00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1780 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1784 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1788 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1792 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x43900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1748 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1752 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1756 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1760 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x43500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1716 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1720 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1724 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1728 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x43100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1684 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1688 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1692 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1696 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x42d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1652 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1656 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1660 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1664 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x42900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1620 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1624 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1628 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1632 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x42500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1588 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1592 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1596 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1600 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s12, 0x42100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1556 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1560 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1564 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1568 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s12 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s12 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s12 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[8:11], 0 addr64 offset:3968
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s8, 0x41d00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1492 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1496 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1500 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1504 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x41900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4080
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1460 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1464 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1468 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1472 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x41500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4064
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1428 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1432 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1436 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1440 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x41100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4048
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1396 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1400 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1404 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1408 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x40d00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4032
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1364 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1368 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1372 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1376 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x40900
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4016
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1332 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1336 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1340 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1344 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x40500
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:4000
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1300 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1304 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1308 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1312 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s8, 0x40100
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3984
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v9, off, s[44:47], 0 offset:1268 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[44:47], 0 offset:1272 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v11, off, s[44:47], 0 offset:1276 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v12, off, s[44:47], 0 offset:1280 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s8 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s8 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s8 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:3968
+; GFX6-NEXT:    s_mov_b32 s4, 0x3fd00
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:3968
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4080
-; GFX6-NEXT:    s_mov_b32 s4, 0x4f900
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4f500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4068 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4072 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4076 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4080 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4f100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4052 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4056 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4060 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4064 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4ed00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4036 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4040 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4044 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4048 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4e900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4020 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4024 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4028 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4032 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4e500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4004 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:4008 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:4012 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4016 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:4000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4e100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3988 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3992 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3996 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:4000 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4dd00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3972 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3976 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3980 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3984 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3968
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4d900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3956 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3960 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3964 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3968 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3952
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4d500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3940 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3944 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3948 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3952 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3936
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4d100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3924 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3928 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3932 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3936 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3920
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4cd00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3908 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3912 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3916 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3920 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3904
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4c900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3892 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3896 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3900 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3904 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3888
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4c500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3876 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3880 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3884 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3888 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3872
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4c100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3860 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3864 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3868 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3872 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3856
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4bd00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3844 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3848 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3852 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3856 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3840
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4b900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3828 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3832 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3836 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3840 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3824
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4b500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3812 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3816 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3820 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3824 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3808
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4b100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3796 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3800 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3804 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3808 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3792
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4ad00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3780 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3784 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3788 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3792 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3776
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4a900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3764 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3768 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3772 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3776 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3760
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4a500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3748 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3752 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3756 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3760 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3744
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x4a100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3732 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3736 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3740 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3744 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3728
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x49d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3716 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3720 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3724 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3728 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3712
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x49900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3700 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3704 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3708 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3712 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3696
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x49500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3684 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3688 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3692 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3696 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3680
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x49100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3668 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3672 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3676 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3680 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3664
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x48d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3652 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3656 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3660 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3664 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3648
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x48900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3636 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3640 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3644 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3648 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3632
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x48500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3620 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3624 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3628 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3632 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3616
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x48100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3604 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3608 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3612 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3616 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3600
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x47d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3588 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3592 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3596 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3600 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3584
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x47900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3572 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3576 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3580 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3584 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3568
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x47500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3556 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3560 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3564 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3568 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3552
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x47100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3540 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3544 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3548 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3552 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3536
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x46d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3524 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3528 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3532 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3536 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3520
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x46900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3508 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3512 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3516 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3520 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3504
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x46500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3492 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3496 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3500 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3504 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3488
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x46100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3476 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3480 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3484 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3488 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3472
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x45d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3460 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3464 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3468 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3472 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3456
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x45900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3444 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3448 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3452 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3456 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3440
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x45500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3428 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3432 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3436 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3440 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3424
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x45100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3412 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3416 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3420 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3424 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3408
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x44d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3396 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3400 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3404 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3408 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3392
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x44900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3380 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3384 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3388 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3392 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3376
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x44500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3364 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3368 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3372 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3376 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3360
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x44100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3348 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3352 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3356 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3360 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3344
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x43d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3332 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3336 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3340 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3344 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3328
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x43900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3316 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3320 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3324 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3328 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3312
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x43500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3300 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3304 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3308 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3312 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3296
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x43100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3284 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3288 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3292 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3296 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3280
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x42d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3268 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3272 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3276 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3280 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3264
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x42900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3252 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3256 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3260 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3264 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3248
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x42500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3236 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3240 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3244 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3248 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3232
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x42100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3220 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3224 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3228 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3232 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3216
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x41d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3204 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3208 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3212 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3216 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3200
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x41900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3188 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3192 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3196 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3200 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3184
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x41500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3172 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3176 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3180 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3184 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3168
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x41100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3156 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3160 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3164 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3168 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3152
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x40d00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3140 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3144 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3148 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3152 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3136
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x40900
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3124 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3128 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3132 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3136 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3120
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x40500
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3108 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3112 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3116 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3120 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3104
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x40100
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3092 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3096 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3100 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3104 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3088
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x3fd00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3076 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3080 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3084 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3088 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3072
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], s4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], s4 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], s4 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], s4 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3060 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3064 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3068 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3072 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3056
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4068 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:4072 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:4076 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4080 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3044 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3048 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3052 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3056 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3040
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4052 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:4056 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:4060 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4064 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3028 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3032 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3036 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3040 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3024
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4036 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:4040 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:4044 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4048 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:3012 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3016 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3020 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3024 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:3008
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4020 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:4024 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:4028 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4032 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2996 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:3000 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:3004 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:3008 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2992
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4004 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:4008 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:4012 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4016 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2980 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2984 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2988 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2992 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2976
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3988 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3992 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3996 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:4000 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2964 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2968 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2972 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2976 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2960
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3972 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3976 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3980 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3984 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2948 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2952 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2956 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2960 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2944
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3956 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3960 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3964 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3968 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2932 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2936 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2940 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2944 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2928
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3940 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3944 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3948 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3952 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2916 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2920 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2924 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2928 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2912
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3924 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3928 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3932 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3936 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2900 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2904 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2908 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2912 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2896
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3908 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3912 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3916 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3920 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2884 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2888 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2892 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2896 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2880
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3892 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3896 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3900 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3904 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2868 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2872 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2876 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2880 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2864
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3876 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3880 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3884 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3888 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2852 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2856 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2860 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2864 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2848
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3860 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3864 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3868 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3872 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2836 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2840 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2844 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2848 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2832
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3844 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3848 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3852 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3856 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2820 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2824 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2828 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2832 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2816
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3828 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3832 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3836 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3840 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2804 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2808 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2812 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2816 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2800
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3812 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3816 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3820 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3824 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2788 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2792 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2796 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2800 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2784
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3796 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3800 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3804 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3808 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2772 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2776 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2780 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2784 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2768
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3780 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3784 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3788 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3792 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2756 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2760 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2764 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2768 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2752
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3764 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3768 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3772 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3776 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2740 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2744 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2748 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2752 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2736
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3748 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3752 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3756 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3760 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2724 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2728 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2732 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2736 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2720
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3732 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3736 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3740 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3744 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2708 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2712 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2716 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2720 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2704
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3716 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3720 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3724 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3728 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2692 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2696 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2700 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2704 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2688
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3700 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3704 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3708 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3712 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2676 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2680 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2684 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2688 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2672
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3684 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3688 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3692 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3696 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2660 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2664 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2668 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2672 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2656
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3668 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3672 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3676 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3680 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2644 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2648 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2652 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2656 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2640
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3652 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3656 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3660 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3664 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2628 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2632 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2636 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2640 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2624
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3636 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3640 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3644 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3648 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2612 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2616 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2620 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2624 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2608
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3620 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3624 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3628 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3632 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2596 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2600 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2604 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2608 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2592
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3604 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3608 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3612 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3616 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2580 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2584 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2588 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2592 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2576
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3588 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3592 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3596 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3600 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2564 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2568 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2572 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2576 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2560
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3572 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3576 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3580 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3584 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2548 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2552 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2556 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2560 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2544
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3540 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3544 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3548 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3552 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2532 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2536 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2540 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2544 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2528
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3508 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3512 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3516 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3520 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2516 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2520 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2524 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2528 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2512
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3476 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3480 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3484 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3488 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2500 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2504 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2508 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2512 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2496
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3444 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3448 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3452 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3456 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2484 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2488 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2492 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2496 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2480
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3412 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3416 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3420 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3424 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2468 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2472 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2476 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2480 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2464
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3380 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3384 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3388 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3392 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2452 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2456 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2460 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2464 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2448
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3348 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3352 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3356 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3360 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2436 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2440 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2444 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2448 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2432
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3316 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3320 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3324 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3328 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2420 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2424 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2428 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2432 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2416
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3300 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3304 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3308 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3312 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2404 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2408 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2412 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2416 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3284 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3288 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3292 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3296 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2388 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2392 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2396 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2400 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2384
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3252 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3256 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3260 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3264 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2372 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2376 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2380 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2384 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2368
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3220 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3224 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3228 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3232 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2356 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2360 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2364 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2368 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2352
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3188 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3192 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3196 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3200 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2340 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2344 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2348 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2352 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2336
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3156 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3160 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3164 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3168 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2324 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2328 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2332 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2336 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2320
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3124 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3128 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3132 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3136 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2308 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2312 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2316 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2320 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2304
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3092 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3096 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3100 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3104 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2292 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2296 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2300 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2304 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2288
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3060 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3064 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3068 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3072 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2276 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2280 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2284 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2288 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2272
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3028 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3032 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3036 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3040 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2260 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2264 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2268 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2272 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2256
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:3012 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3016 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3020 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3024 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2244 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2248 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2252 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2256 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2996 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:3000 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:3004 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:3008 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2228 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2232 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2236 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2240 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2980 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2984 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2988 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2992 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2212 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2216 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2220 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2224 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2948 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2952 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2956 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2960 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2196 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2200 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2204 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2208 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2916 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2920 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2924 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2928 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2180 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2184 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2188 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2192 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2884 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2888 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2892 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2896 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2164 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2168 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2172 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2176 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2852 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2856 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2860 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2864 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2148 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2152 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2156 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2160 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2820 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2824 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2828 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2832 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2132 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2136 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2140 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2144 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2788 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2792 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2796 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2800 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2116 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2120 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2124 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2128 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2756 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2760 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2764 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2768 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2100 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2104 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2108 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2112 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2096
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2724 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2728 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2732 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2736 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2084 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2088 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2092 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2096 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2080
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2708 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2712 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2716 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2720 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2068 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2072 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2076 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2080 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2064
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2692 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2696 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2700 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2704 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2052 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2056 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2060 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2064 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2048
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2660 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2664 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2668 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2672 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2036 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2040 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2044 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2048 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2032
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2628 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2632 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2636 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2640 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2020 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2024 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2028 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2032 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2016
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2596 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2600 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2604 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2608 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:2004 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:2008 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:2012 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2016 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:2000
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2564 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2568 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2572 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2576 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1988 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1992 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1996 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:2000 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1984
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2532 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2536 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2540 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2544 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1972 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1976 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1980 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1984 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1968
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2500 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2504 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2508 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2512 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1956 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1960 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1964 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1968 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1952
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2468 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2472 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2476 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2480 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1940 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1944 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1948 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1952 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1936
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2436 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2440 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2444 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2448 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1924 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1928 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1932 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1936 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1920
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2420 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2424 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2428 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2432 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1908 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1912 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1916 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1920 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1904
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2404 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2408 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2412 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2416 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1892 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1896 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1900 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1904 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1888
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2388 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2392 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2396 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2400 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1876 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1880 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1884 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1888 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1872
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2356 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2360 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2364 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2368 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1860 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1864 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1868 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1872 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1856
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2324 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2328 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2332 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2336 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1844 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1848 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1852 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1856 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1840
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2292 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2296 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2300 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2304 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1828 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1832 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1836 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1840 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1824
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2260 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2264 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2268 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2272 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1812 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1816 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1820 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1824 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1808
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2228 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2232 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2236 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2240 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1796 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1800 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1804 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1808 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1792
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2196 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2200 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2204 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2208 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1780 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1784 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1788 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1792 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1776
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2164 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2168 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2172 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2176 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1764 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1768 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1772 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1776 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1760
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2132 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2136 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2140 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2144 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1748 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1752 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1756 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1760 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1744
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2116 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2120 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2124 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2128 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1732 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1736 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1740 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1744 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1728
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2100 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2104 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2108 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2112 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1716 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1720 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1724 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1728 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1712
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2068 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2072 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2076 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2080 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1700 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1704 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1708 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1712 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1696
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2036 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2040 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2044 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2048 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1684 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1688 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1692 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1696 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1680
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:2004 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:2008 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:2012 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:2016 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1668 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1672 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1676 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1680 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1664
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1972 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1976 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1980 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1984 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1652 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1656 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1660 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1664 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1648
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1940 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1944 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1948 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1952 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1636 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1640 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1644 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1648 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1632
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1908 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1912 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1916 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1920 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1620 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1624 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1628 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1632 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1616
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1876 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1880 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1884 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1888 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1604 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1608 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1612 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1616 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1600
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1844 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1848 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1852 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1856 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1588 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1592 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1596 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1600 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1584
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1828 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1832 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1836 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1840 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1572 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1576 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1580 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1584 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1568
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1812 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1816 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1820 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1824 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1556 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1560 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1564 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1568 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1552
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1796 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1800 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1804 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1808 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1540 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1544 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1548 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1552 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1536
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1764 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1768 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1772 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1776 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1524 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1528 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1532 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1536 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1520
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1732 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1736 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1740 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1744 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1508 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1512 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1516 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1520 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1504
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1700 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1704 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1708 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1712 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1492 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1496 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1500 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1504 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1488
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1668 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1672 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1676 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1680 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1476 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1480 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1484 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1488 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1472
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1636 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1640 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1644 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1648 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1460 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1464 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1468 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1472 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1456
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1604 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1608 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1612 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1616 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1444 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1448 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1452 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1456 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1440
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1572 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1576 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1580 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1584 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1428 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1432 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1436 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1440 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1424
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1540 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1544 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1548 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1552 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1412 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1416 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1420 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1424 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1408
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1524 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1528 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1532 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1536 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1396 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1400 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1404 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1408 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1392
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1508 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1512 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1516 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1520 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1380 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1384 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1388 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1392 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1376
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1476 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1480 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1484 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1488 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1364 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1368 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1372 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1376 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1360
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1444 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1448 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1452 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1456 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1348 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1352 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1356 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1360 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1344
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1412 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1416 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1420 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1424 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1332 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1336 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1340 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1344 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1328
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1380 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1384 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1388 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1392 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1316 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1320 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1324 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1328 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1312
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1348 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1352 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1356 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1360 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1300 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1304 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1308 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1312 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1296
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1316 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1320 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1324 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1328 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1284 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1288 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1292 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1296 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1280
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1284 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1288 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1292 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1296 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1268 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1272 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1276 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1280 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1264
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1252 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1256 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1260 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1264 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1252 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1256 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1260 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1264 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1248
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1236 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1240 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1244 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1248 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1236 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1240 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1244 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1248 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1232
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1220 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1224 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1228 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1232 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1220 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1224 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1228 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1232 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1216
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1204 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1208 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1212 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1216 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1204 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1208 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1212 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1216 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1200
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1188 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1192 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1196 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1200 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1188 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1192 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1196 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1200 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1184
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1172 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1176 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1180 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1184 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1172 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1176 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1180 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1184 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1168
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1156 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1160 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1164 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1168 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1156 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1160 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1164 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1168 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1152
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1140 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1144 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1148 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1152 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1140 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1144 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1148 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1152 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1136
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1124 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1128 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1132 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1136 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1124 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1128 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1132 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1136 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1120
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1108 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1112 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1116 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1120 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1108 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1112 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1116 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1120 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1104
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1092 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1096 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1100 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1104 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1092 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1096 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1100 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1104 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1088
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1076 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1080 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1084 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1088 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1076 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1080 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1084 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1088 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1072
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1060 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1064 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1068 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1072 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1060 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1064 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1068 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1072 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1056
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1044 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1048 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1052 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1056 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1044 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1048 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1052 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1056 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1040
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1028 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1032 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1036 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1040 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1028 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1032 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1036 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1040 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1024
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:1012 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1016 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1020 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1024 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:1012 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1016 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1020 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1024 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:1008
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:996 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:1000 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:1004 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:1008 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:996 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:1000 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:1004 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:1008 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:992
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:980 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:984 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:988 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:992 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:980 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:984 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:988 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:992 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:976
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:964 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:968 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:972 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:976 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:964 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:968 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:972 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:976 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:960
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:948 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:952 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:956 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:960 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:948 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:952 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:956 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:960 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:944
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:932 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:936 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:940 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:944 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:932 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:936 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:940 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:944 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:928
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:916 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:920 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:924 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:928 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:916 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:920 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:924 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:928 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:912
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:900 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:904 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:908 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:912 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:900 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:904 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:908 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:912 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:896
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:884 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:888 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:892 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:896 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:884 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:888 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:892 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:896 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:880
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:868 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:872 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:876 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:880 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:868 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:872 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:876 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:880 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:864
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:852 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:856 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:860 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:864 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:852 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:856 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:860 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:864 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:848
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:836 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:840 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:844 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:848 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:836 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:840 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:844 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:848 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:832
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:820 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:824 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:828 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:832 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:820 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:824 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:828 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:832 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:816
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:804 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:808 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:812 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:816 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:804 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:808 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:812 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:816 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:800
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:788 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:792 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:796 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:800 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:788 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:792 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:796 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:800 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:784
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:772 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:776 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:780 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:784 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:772 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:776 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:780 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:784 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:768
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:756 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:760 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:764 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:768 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:756 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:760 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:764 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:768 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:752
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:740 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:744 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:748 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:752 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:740 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:744 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:748 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:752 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:736
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:724 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:728 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:732 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:736 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:724 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:728 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:732 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:736 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:720
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:708 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:712 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:716 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:720 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:708 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:712 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:716 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:720 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:704
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:692 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:696 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:700 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:704 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:692 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:696 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:700 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:704 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:688
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:676 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:680 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:684 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:688 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:676 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:680 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:684 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:688 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:672
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:660 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:664 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:668 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:672 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:660 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:664 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:668 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:672 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:656
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:644 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:648 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:652 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:656 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:644 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:648 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:652 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:656 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:640
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:628 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:632 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:636 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:640 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:628 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:632 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:636 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:640 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:624
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:612 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:616 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:620 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:624 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:612 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:616 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:620 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:624 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:608
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:596 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:600 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:604 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:608 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:596 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:600 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:604 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:608 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:592
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:580 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:584 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:588 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:592 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:580 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:584 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:588 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:592 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:576
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:564 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:568 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:572 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:576 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:564 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:568 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:572 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:576 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:560
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:548 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:552 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:556 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:560 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:548 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:552 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:556 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:560 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:544
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:532 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:536 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:540 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:544 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:532 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:536 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:540 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:544 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:528
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:516 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:520 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:524 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:528 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:516 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:520 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:524 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:528 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:512
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:500 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:504 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:508 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:512 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:500 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:504 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:508 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:512 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:496
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:484 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:488 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:492 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:496 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:484 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:488 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:492 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:496 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:480
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:468 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:472 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:476 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:480 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:468 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:472 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:476 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:480 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:464
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:452 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:456 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:460 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:464 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:452 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:456 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:460 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:464 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:448
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:436 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:440 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:444 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:448 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:436 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:440 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:444 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:448 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:432
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:420 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:424 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:428 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:432 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:420 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:424 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:428 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:432 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:416
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:404 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:408 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:412 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:416 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:404 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:408 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:412 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:416 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:388 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:392 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:396 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:400 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:388 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:392 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:396 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:400 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:384
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:372 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:376 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:380 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:384 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:372 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:376 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:380 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:384 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:368
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:356 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:360 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:364 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:368 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:356 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:360 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:364 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:368 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:352
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:340 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:344 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:348 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:352 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:340 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:344 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:348 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:352 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:336
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:324 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:328 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:332 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:336 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:324 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:328 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:332 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:336 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:320
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:308 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:312 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:316 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:320 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:308 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:312 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:316 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:320 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:304
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:292 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:296 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:300 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:304 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:292 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:296 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:300 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:304 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:288
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:276 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:280 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:284 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:288 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:276 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:280 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:284 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:288 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:272
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:260 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:264 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:268 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:272 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:260 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:264 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:268 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:272 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:256
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:244 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:248 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:252 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:256 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:244 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:248 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:252 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:256 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:228 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:232 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:236 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:240 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:228 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:232 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:236 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:240 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:212 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:216 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:220 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:224 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:212 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:216 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:220 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:224 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:196 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:200 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:204 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:208 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:196 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:200 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:204 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:208 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:180 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:184 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:188 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:192 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:180 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:184 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:188 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:192 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:164 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:168 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:172 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:176 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:164 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:168 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:172 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:176 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:148 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:152 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:156 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:160 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:148 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:152 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:156 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:160 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:132 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:136 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:140 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:144 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:132 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:136 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:140 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:144 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:116 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:120 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:124 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:128 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:116 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:120 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:124 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:128 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:100 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:104 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:108 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:112 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:100 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:104 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:108 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:112 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:84 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:88 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:92 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:96 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:84 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:88 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:92 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:96 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:68 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:72 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:76 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:80 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:68 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:72 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:76 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:80 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:52 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:56 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:60 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:64 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:52 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:56 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:60 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:64 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:36 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:40 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:44 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:48 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:40 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:44 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:48 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:20 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:24 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:28 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:32 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:20 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:24 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:28 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:32 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[44:47], 0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v1, off, s[44:47], 0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v2, off, s[44:47], 0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v3, off, s[44:47], 0 offset:16 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], 0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], 0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], 0 offset:16 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
@@ -5009,13 +5002,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
 ; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 4
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0x84
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x104
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x184
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x204
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x284
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x304
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x384
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
@@ -5050,1261 +5036,1268 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0x94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0xf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s5 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s5, 0x180
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x180
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x104
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x114
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x114
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x124
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x124
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x134
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x134
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x144
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x144
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x154
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x154
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x164
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x164
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x174
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x174
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s6 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s6, 0x200
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x200
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x184
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x194
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x194
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x1f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s7 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s7, 0x280
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s7, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x280
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x204
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x214
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x214
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x224
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x224
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x234
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x234
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x244
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x244
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x254
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x254
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x264
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x264
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x274
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x274
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s8 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s8, 0x300
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x300
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x284
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x294
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x294
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x2f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s9 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s9, 0x380
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s9, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x380
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x304
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x314
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x314
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x324
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x324
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x334
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x334
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x344
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x344
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x354
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x354
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x364
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x364
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x374
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x374
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s10 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s10, 0x400
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s10, v2
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x400
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x384
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x394
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x394
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s11 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x3f4
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x404
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x404
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x414
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x414
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x424
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x424
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x434
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x434
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x444
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x444
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x454
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x454
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x464
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x464
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x474
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x474
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x484
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x484
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x494
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x494
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x4f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x504
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x504
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x514
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x514
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x524
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x524
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x534
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x534
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x544
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x544
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x554
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x554
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x564
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x564
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x574
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x574
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x584
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x584
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x594
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x594
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x5f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x604
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x604
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x614
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x614
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x624
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x624
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x634
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x634
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x644
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x644
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x654
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x654
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x664
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x664
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x674
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x674
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x684
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x684
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x694
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x694
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x6f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x704
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x704
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x714
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x714
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x724
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x724
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x734
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x734
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x744
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x744
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x754
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x754
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x764
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x764
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x774
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x774
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x784
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x784
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x794
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x794
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x7f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x804
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x804
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x814
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x814
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x824
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x824
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x834
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x834
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x844
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x844
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x854
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x854
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x864
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x864
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x874
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x874
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x884
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x884
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x894
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x894
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x8f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x904
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x904
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x914
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x914
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x924
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x924
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x934
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x934
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x944
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x944
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x954
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x954
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x964
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x964
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x974
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x974
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x984
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x984
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x994
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x994
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x9f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xa94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xaa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaa4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xab4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xab4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xac4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xac4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xad4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xad4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xae4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xae4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xaf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaf4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xb94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xba4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xba4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xbb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xbc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbc4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xbd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbd4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xbe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbe4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xbf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbf4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xc94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xca4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xca4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xcb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xcc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcc4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xcd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcd4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xce4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xce4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xcf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcf4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xd94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xda4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xda4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xdb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xdc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdc4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xdd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdd4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xde4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xde4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xdf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdf4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xe94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xea4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xea4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xeb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xeb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xec4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xec4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xed4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xed4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xee4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xee4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xef4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xef4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf04
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf14
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf34
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf44
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf54
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf74
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf84
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xf94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf94
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xfa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfa4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xfb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfb4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xfc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfc4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xfd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfd4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xfe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfe4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0xff4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xff4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1004
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1004
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1014
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1014
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1024
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1024
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1034
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1034
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1044
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1044
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1054
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1054
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1064
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1064
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1074
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1074
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1084
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1084
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1094
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1094
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x10f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1104
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1104
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1114
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1114
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1124
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1124
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1134
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1134
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1144
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1144
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1154
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1154
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1164
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1164
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1174
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1174
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1184
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1184
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1194
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1194
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x11f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1204
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1204
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1214
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1214
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1224
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1224
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1234
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1234
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1244
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1244
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1254
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1254
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1264
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1264
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1274
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1274
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1284
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1284
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1294
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1294
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x12f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12f4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1304
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1304
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1314
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1314
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1324
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1324
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1334
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1334
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1344
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1344
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1354
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1354
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1364
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1364
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1374
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1374
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1384
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1384
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x1394
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1394
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x13a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13a4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x13b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13b4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x13c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13c4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x13d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13d4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s11, 0x13e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s11 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13e4
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
@@ -7346,7 +7339,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3f4
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s10, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x400, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7380,7 +7373,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s9, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x380, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x364
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7414,7 +7407,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x300, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7448,7 +7441,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s7, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x280, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x264
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7482,7 +7475,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x200, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1e4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7516,7 +7509,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x180, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x164
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -7550,7 +7543,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v4
+; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x100, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xe4
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 2f43cc022afd33f..fbe0b156cd9baa0 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -15,126 +15,126 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK: bb.0..expVert:
   ; CHECK-NEXT:   liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef %56.sub0:sgpr_64 = COPY $sgpr31
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr27
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr25
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr5
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr18
-  ; CHECK-NEXT:   undef %50.sub0:sgpr_64 = COPY $sgpr19
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr20
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr21
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr22
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr23
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8
-  ; CHECK-NEXT:   undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
-  ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc
+  ; CHECK-NEXT:   undef [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr31
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr27
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr25
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18
+  ; CHECK-NEXT:   undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr23
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr9
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
+  ; CHECK-NEXT:   undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
+  ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   KILL undef %74:sreg_64
   ; CHECK-NEXT:   KILL undef %132:sgpr_128
-  ; CHECK-NEXT:   KILL %130.sub0, %130.sub1
+  ; CHECK-NEXT:   KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   undef %302.sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   KILL undef %89:sgpr_128
   ; CHECK-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], 64, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY6]], 64, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %54:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
-  ; CHECK-NEXT:   %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
+  ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc
-  ; CHECK-NEXT:   %176.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %183.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %183.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %190.sub0:sreg_64 = S_ADD_U32 %50.sub0, [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   %190.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %200.sub0:sreg_64 = S_ADD_U32 %50.sub0, undef %171:sreg_32, implicit-def $scc
-  ; CHECK-NEXT:   %200.sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 224, implicit-def $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], undef %171:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %171:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_9:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]].sub0, 224, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %210.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %210.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %217.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   %217.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %224.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_1]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %224.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 %50.sub0, 576, implicit-def $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_9]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_1]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_13:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]].sub0, 576, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %51:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %241.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %241.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %253.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %253.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %261.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_2]], undef %171:sreg_32, implicit-def $scc
-  ; CHECK-NEXT:   %261.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %273.sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %273.sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %286.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   %286.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %293.sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %293.sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_13]], undef %171:sreg_32, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_2]], [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, undef %314:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %302, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %302, 16, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %312:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %369:sgpr_128, undef %370:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4)
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %367:sgpr_128, undef %368:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %378:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %362:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %373:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %327.sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %327.sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4)
-  ; CHECK-NEXT:   %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4)
+  ; CHECK-NEXT:   [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %394:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
@@ -146,119 +146,119 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 160, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_24:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], 160, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %36:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 4, implicit-def dead $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_25:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_24]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_25:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc
-  ; CHECK-NEXT:   %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_26:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_24]], [[S_LSHL_B32_4]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_26:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_27:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 168, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
-  ; CHECK-NEXT:   [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
+  ; CHECK-NEXT:   [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc
-  ; CHECK-NEXT:   %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_28:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_5]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_28:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_28]], 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   %71.sub3:sgpr_128 = S_MOV_B32 553734060
-  ; CHECK-NEXT:   %71.sub2:sgpr_128 = S_MOV_B32 -1
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4)
-  ; CHECK-NEXT:   [[COPY13]].sub1:sgpr_128 = COPY %302.sub1
-  ; CHECK-NEXT:   [[COPY13]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4)
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_18]], 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_19]], 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc
-  ; CHECK-NEXT:   %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_29:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_6]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_29:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_29]], 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4)
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
-  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; CHECK-NEXT:   [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_20]], 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4)
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc
-  ; CHECK-NEXT:   %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   undef [[S_ADD_U32_30:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_7]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_30:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4)
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-  ; CHECK-NEXT:   [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
-  ; CHECK-NEXT:   KILL %411.sub0, %411.sub1
-  ; CHECK-NEXT:   KILL undef %488:sreg_64
-  ; CHECK-NEXT:   KILL [[COPY15]].sub0_sub1_sub2, [[COPY15]].sub3
-  ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_30]], 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4)
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_25]], 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %484:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
+  ; CHECK-NEXT:   KILL [[S_ADD_U32_25]].sub0, [[S_ADD_U32_25]].sub1
+  ; CHECK-NEXT:   KILL undef %484:sreg_64
+  ; CHECK-NEXT:   KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+  ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_26]], 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4)
   ; CHECK-NEXT:   [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
-  ; CHECK-NEXT:   undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc
-  ; CHECK-NEXT:   %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_31:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_27]], [[S_LSHL_B32_8]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_31:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_31]], 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM23]]
   ; CHECK-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
-  ; CHECK-NEXT:   [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY16]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]]
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 96, implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_32:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], 96, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc
-  ; CHECK-NEXT:   %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4)
-  ; CHECK-NEXT:   undef %522.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   %522.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
-  ; CHECK-NEXT:   undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc
-  ; CHECK-NEXT:   %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_33:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_33:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_33]], 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_34:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_1]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_34:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_34]], 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
+  ; CHECK-NEXT:   undef [[S_ADD_U32_35:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_32]], [[S_LSHL_B32_2]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADD_U32_35:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_35]], 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
-  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM27]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM25]]
   ; CHECK-NEXT:   KILL [[V_MOV_B32_e32_]]
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM26]]
+  ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM27]]
   ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -363,20 +363,20 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[V_OR_B32_e64_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_60]], [[V_ADD_U32_e64_25]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_61]], [[V_ADD_U32_e64_26]], implicit $exec
-  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_62]], [[V_ADD_U32_e64_27]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %559:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
-  ; CHECK-NEXT:   undef %624.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_gfx10 %624, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+  ; CHECK-NEXT:   undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
+  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_gfx10 [[V_CNDMASK_B32_e64_]], undef %573:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
   ; CHECK-NEXT:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 24319a639da4472..520ec6e24ae3bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1779,30 +1779,29 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 7201ffaf561662a..20364b57aea5eef 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -14,96 +14,44 @@
 define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) {
 ; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
-; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
-; CHECK-NEXT:    ; kill: killed $sgpr0_sgpr1
-; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
-; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
-; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s1, 0x40180000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 1
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 2
-; CHECK-NEXT:    s_mov_b32 s1, 0x40220000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
-; CHECK-NEXT:    s_mov_b32 s1, 0x40260000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-NEXT:    s_load_dword s0, s[2:3], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
-; CHECK-NEXT:    s_mov_b32 s2, 0
-; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    s_mov_b32 s1, s7
-; CHECK-NEXT:    s_mov_b32 s0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
-; CHECK-NEXT:    s_mov_b32 s6, s2
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
-; CHECK-NEXT:    s_mov_b32 s3, s1
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
+; CHECK-NEXT:    s_add_i32 s1, s1, s0
+; CHECK-NEXT:    s_cmpk_lt_i32 s1, 0xa00
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40180000
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40220000
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40240000
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    s_mov_b32 s3, s1
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 7
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 8
-; CHECK-NEXT:    s_mov_b32 s3, s1
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
-; CHECK-NEXT:    s_mov_b32 s4, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
-; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
-; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
-; CHECK-NEXT:    ; kill: killed $vgpr2
+; CHECK-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %0 = load i32, ptr addrspace(4) null, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dcf49de68492405..05de0bc5f282ade 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -39,68 +39,68 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v0, v42, s[76:77]
+; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[76:77]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, 0x40994400
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s78, 0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43]
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
 ; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, wobble at gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, wobble at gotpcrel32@hi+12
 ; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[76:77], s[6:7], 0x0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v2
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[74:75], s[6:7], 0x0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
 ; GLOBALNESS1-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS1-NEXT:    s_mov_b32 s71, s15
 ; GLOBALNESS1-NEXT:    s_mov_b32 s72, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[74:75], 0x80
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v1
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr44_vgpr45
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v2
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v4
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v0
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v3
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v2
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[60:61]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_29
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -115,7 +115,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0x80
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS1-NEXT:    buffer_store_dword v42, off, s[0:3], 0
@@ -129,7 +130,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[74:75]
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
@@ -165,12 +166,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[76:77], s[62:63]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -192,16 +193,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
 ; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[60:61]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -216,7 +217,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[56:57]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -230,7 +231,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s13, s71
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[74:75]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
@@ -241,7 +242,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s70
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[74:75]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
@@ -258,12 +259,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[80:81]
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[56:57]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -325,68 +326,68 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v0, v42, s[72:73]
+; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[72:73]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, 0x40994400
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s74, 0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43]
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, wobble at gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, wobble at gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[78:79], s[6:7], 0x0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v2
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[76:77], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v3
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s16
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s69, s15
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[76:77], 0x80
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v1
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr44_vgpr45
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v2
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v4
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v0
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v3
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[60:61], 1, v2
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[60:61]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -401,7 +402,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0x80
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    buffer_store_dword v42, off, s[0:3], 0
@@ -415,7 +417,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
@@ -451,12 +453,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[62:63], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[78:79], s[62:63]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
@@ -478,16 +480,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[60:61]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -502,7 +504,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
 ; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[56:57]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
@@ -516,7 +518,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s13, s69
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
@@ -527,7 +529,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s68
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
@@ -544,12 +546,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[80:81]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[56:57]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 7aa36a8b377bff5..e809292aad1d38b 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1165,30 +1165,29 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 2f82260888a7da0..86e2822a3e5b164 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -33,8 +33,8 @@ define hidden void @widget() {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
-; GCN-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-NEXT:    s_mov_b64 s[20:21], -1
+; GCN-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
 ; GCN-NEXT:    s_mov_b64 s[46:47], 0
@@ -303,12 +303,12 @@ define hidden void @blam() {
 ; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT:    s_mov_b64 s[50:51], 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v41
-; GCN-NEXT:    flat_load_dword v44, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v43, 0
+; GCN-NEXT:    flat_load_dword v44, v[0:1]
+; GCN-NEXT:    s_mov_b64 s[50:51], 0
 ; GCN-NEXT:    s_getpc_b64 s[52:53]
 ; GCN-NEXT:    s_add_u32 s52, s52, spam at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s53, s53, spam at rel32@hi+12
@@ -329,10 +329,10 @@ define hidden void @blam() {
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    flat_load_dword v0, v[42:43]
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
-; GCN-NEXT:    s_mov_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 2, v0
-; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_xor_b64 s[56:57], exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execz .LBB1_12
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 91d09c01639ffcb..9c316612528c208 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1158,30 +1158,29 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index f1edd5c74b1054c..0f32eb1b12771fd 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -257,11 +257,10 @@ define amdgpu_kernel void @test_s0_s1_k_f32(ptr addrspace(1) %out, float %a, flo
 
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
-; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK0_SUB1]]]
+; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK0_SUB1]]]
 
-; Same zero component is re-used for half of each immediate.
-; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
-; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK1_SUB1]]]
+; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
+; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[{{[0-9]+}}:[[VK1_SUB1]]]
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..0f1ac360096ae02 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -474,7 +474,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT:   successors: %bb.7(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
-  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec
+  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec
   ; SI-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
@@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT: bb.5.Flow:
   ; SI-NEXT:   successors: %bb.1(0x40000000), %bb.7(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %52:vgpr_32, %bb.6
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %49:vgpr_32, %bb.6
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.1
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.6.sw.bb18:
   ; SI-NEXT:   successors: %bb.5(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
+  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
   ; SI-NEXT:   [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.5



More information about the llvm-commits mailing list