[llvm] [AMDGPU] Support for nested add in GVS pattern matching (PR #186910)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 16 16:00:31 PDT 2026


https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/186910

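Teach global SAddr (GVS) address selection, in both SelectionDAG and GlobalISel, to fold the outer operand of a nested add (a constant or an extended i32 VGPR value) into the i32 voffset of the inner `sgpr + ext(i32 vgpr)` add when the i32 addition provably does not overflow. Fixes ROCM-20181.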


>From 106c22eb961283a154b5c542c751ff3febf668d3 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 16 Mar 2026 18:48:51 -0400
Subject: [PATCH] [AMDGPU] Support for nested add in GVS pattern matching

Fixes ROCM-20181.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  108 +
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  135 +
 llvm/test/CodeGen/AMDGPU/acc-ldst.ll          |    4 +-
 .../llvm.amdgcn.global.load.async.to.lds.ll   |   56 +-
 .../promote-constOffset-to-imm-gfx12.ll       |   37 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |  984 ++---
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   | 3822 +++++------------
 7 files changed, 1946 insertions(+), 3200 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index cc2058a5a1d4a..11d48cd1ed811 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2056,6 +2056,114 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
       }
     }
 
+    // Try to fold an outer offset into a nested i32 voffset:
+    //   add(add(sgpr, ext(i32 vgpr)), const64)
+    //     -> base = sgpr, voffset = add(i32_vgpr, const32)
+    //   add(add(sgpr, ext(i32 vgpr_a)), ext(i32 vgpr_b))
+    //     -> base = sgpr, voffset = add(i32_vgpr_a, i32_vgpr_b)
+    // Constants must be positive; the i32 add must provably not overflow.
+    if (!SAddr) {
+      bool IsSigned = Subtarget->hasSignedGVSOffset();
+
+      for (unsigned I = 0; I < 2 && !SAddr; ++I) {
+        SDValue OuterOp = Addr.getOperand(I);
+        SDValue InnerAddr = Addr.getOperand(1 - I);
+
+        if (!InnerAddr->isAnyAdd())
+          continue;
+
+        SDValue OuterI32;
+        KnownBits OuterKnown(32);
+        int64_t SplitImmOffset = 0;
+        int64_t ConstVal = 0;
+        bool IsConst = false;
+
+        if (auto *C = dyn_cast<ConstantSDNode>(OuterOp)) {
+          int64_t OuterConst = C->getSExtValue();
+          if (OuterConst <= 0)
+            continue;
+
+          ConstVal = OuterConst;
+          if (NeedIOffset) {
+            const SIInstrInfo *TII = Subtarget->getInstrInfo();
+            std::tie(SplitImmOffset, ConstVal) = TII->splitFlatOffset(
+                OuterConst, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+          }
+
+          if (!(IsSigned ? isInt<32>(ConstVal) : isUInt<32>(ConstVal)))
+            continue;
+
+          IsConst = true;
+          OuterKnown = KnownBits::makeConstant(APInt(32, ConstVal));
+        } else {
+          OuterI32 = matchExtFromI32orI32(OuterOp, IsSigned, CurDAG);
+          if (!OuterI32)
+            continue;
+          OuterKnown = CurDAG->computeKnownBits(OuterI32);
+        }
+
+        for (unsigned J = 0; J < 2; ++J) {
+          SDValue MaybeBase = InnerAddr.getOperand(J);
+          SDValue MaybeExt = InnerAddr.getOperand(1 - J);
+
+          if (MaybeBase->isDivergent())
+            continue;
+
+          SDValue InnerI32 = matchExtFromI32orI32(MaybeExt, IsSigned, CurDAG);
+          if (!InnerI32)
+            continue;
+
+          KnownBits InnerKnown = CurDAG->computeKnownBits(InnerI32);
+
+          bool NoOverflow = false;
+          if (IsSigned) {
+            bool MinOF = false;
+            bool MaxOF = false;
+            (void)InnerKnown.getSignedMinValue().sadd_ov(
+                OuterKnown.getSignedMinValue(), MinOF);
+            (void)InnerKnown.getSignedMaxValue().sadd_ov(
+                OuterKnown.getSignedMaxValue(), MaxOF);
+            NoOverflow = !MinOF && !MaxOF;
+          } else {
+            bool OF = false;
+            (void)InnerKnown.getMaxValue().uadd_ov(OuterKnown.getMaxValue(),
+                                                   OF);
+            NoOverflow = !OF;
+          }
+
+          if (!NoOverflow)
+            continue;
+
+          SDLoc SL(N);
+          SAddr = MaybeBase;
+
+          if (IsConst && ConstVal == 0) {
+            VOffset = InnerI32;
+          } else {
+            SDValue AddOp =
+                IsConst ? getMaterializedScalarImm32(ConstVal, SL) : OuterI32;
+            if (Subtarget->hasAddNoCarryInsts()) {
+              SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+              VOffset = SDValue(
+                  CurDAG->getMachineNode(AMDGPU::V_ADD_U32_e64, SL, MVT::i32,
+                                         {InnerI32, AddOp, Clamp}),
+                  0);
+            } else {
+              SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+              SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+              VOffset =
+                  SDValue(CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, SL,
+                                                 VTs, {InnerI32, AddOp, Clamp}),
+                          0);
+            }
+          }
+
+          ImmOffset = SplitImmOffset;
+          break;
+        }
+      }
+    }
+
     if (SAddr) {
       Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
       return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 81e224355411b..89e7aa0a0a323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5978,8 +5978,143 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
                  }}};
       }
     }
+
+    // Try to fold an outer offset into a nested i32 voffset:
+    //   G_PTR_ADD(G_PTR_ADD(sgpr, ext(i32 vgpr)), const64)
+    //     -> base = sgpr, voffset = add(i32_vgpr, const32)
+    //   G_PTR_ADD(G_PTR_ADD(sgpr, ext(i32 vgpr_a)), ext(i32 vgpr_b))
+    //     -> base = sgpr, voffset = add(i32_vgpr_a, i32_vgpr_b)
+    // Valid only when the i32 addition provably does not overflow.
+    {
+      bool IsSigned = Subtarget->hasSignedGVSOffset();
+      Register OuterOffset = AddrDef->MI->getOperand(2).getReg();
+      Register InnerPtrAdd =
+          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+      auto InnerDef = getDefSrcRegIgnoringCopies(InnerPtrAdd, *MRI);
+
+      if (InnerDef && InnerDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+        Register MaybeBase =
+            getSrcRegIgnoringCopies(InnerDef->MI->getOperand(1).getReg(), *MRI);
+        Register InnerOffset = InnerDef->MI->getOperand(2).getReg();
+
+        if (isSGPR(MaybeBase)) {
+          if (Register InnerI32 =
+                  matchExtendFromS32OrS32(InnerOffset, IsSigned)) {
+            Register OuterI32;
+            KnownBits OuterKnown(32);
+            int64_t SplitImmOffset = 0;
+            int64_t ConstVal = 0;
+            bool IsConst = false;
+            bool OuterMatched = false;
+
+            auto OuterConst =
+                getIConstantVRegValWithLookThrough(OuterOffset, *MRI);
+            if (OuterConst && OuterConst->Value.getSExtValue() > 0) {
+              int64_t OuterConstVal = OuterConst->Value.getSExtValue();
+              ConstVal = OuterConstVal;
+              if (NeedIOffset) {
+                std::tie(SplitImmOffset, ConstVal) =
+                    TII.splitFlatOffset(OuterConstVal, AMDGPUAS::GLOBAL_ADDRESS,
+                                        SIInstrFlags::FlatGlobal);
+              }
+
+              if (IsSigned ? isInt<32>(ConstVal) : isUInt<32>(ConstVal)) {
+                IsConst = true;
+                OuterKnown = KnownBits::makeConstant(APInt(32, ConstVal));
+                OuterMatched = true;
+              }
+            } else if ((OuterI32 =
+                            matchExtendFromS32OrS32(OuterOffset, IsSigned))) {
+              OuterKnown = VT->getKnownBits(OuterI32);
+              OuterMatched = true;
+            }
+
+            if (OuterMatched) {
+              KnownBits InnerKnown = VT->getKnownBits(InnerI32);
+
+              bool NoOverflow = false;
+              if (IsSigned) {
+                bool MinOF, MaxOF;
+                (void)InnerKnown.getSignedMinValue().sadd_ov(
+                    OuterKnown.getSignedMinValue(), MinOF);
+                (void)InnerKnown.getSignedMaxValue().sadd_ov(
+                    OuterKnown.getSignedMaxValue(), MaxOF);
+                NoOverflow = !MinOF && !MaxOF;
+              } else {
+                bool OF;
+                (void)InnerKnown.getMaxValue().uadd_ov(OuterKnown.getMaxValue(),
+                                                       OF);
+                NoOverflow = !OF;
+              }
+
+              if (NoOverflow) {
+                MachineInstr *MI = Root.getParent();
+                MachineBasicBlock *MBB = MI->getParent();
+                const DebugLoc &DL = MI->getDebugLoc();
+
+                Register VOffsetReg;
+                if (IsConst && ConstVal == 0) {
+                  VOffsetReg = InnerI32;
+                } else {
+                  Register AddOpReg;
+                  if (IsConst) {
+                    AddOpReg =
+                        MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32),
+                            AddOpReg)
+                        .addImm(ConstVal);
+                  } else {
+                    AddOpReg = OuterI32;
+                  }
+
+                  VOffsetReg =
+                      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+                  if (STI.hasAddNoCarryInsts()) {
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ADD_U32_e64),
+                            VOffsetReg)
+                        .addReg(InnerI32)
+                        .addReg(AddOpReg)
+                        .addImm(0);
+                  } else {
+                    Register UnusedCarry =
+                        MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
+                    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64),
+                            VOffsetReg)
+                        .addDef(UnusedCarry, RegState::Dead)
+                        .addReg(InnerI32)
+                        .addReg(AddOpReg)
+                        .addImm(0);
+                  }
+                }
+
+                if (NeedIOffset)
+                  return {{
+                      [=](MachineInstrBuilder &MIB) { MIB.addReg(MaybeBase); },
+                      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffsetReg); },
+                      [=](MachineInstrBuilder &MIB) {
+                        MIB.addImm(SplitImmOffset);
+                      },
+                      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+                  }};
+                return {{
+                    [=](MachineInstrBuilder &MIB) { MIB.addReg(MaybeBase); },
+                    [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffsetReg); },
+                    [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+                }};
+              }
+            }
+          }
+        }
+      }
+    }
   }
 
+  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+  // drop this.
+  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
+    return std::nullopt;
+
   // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
   // drop this.
   if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 4258d1d4bd874..7f3603435d147 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -63,7 +63,7 @@ bb:
 ; GCN-LABEL: {{^}}test_load_store:
 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-NOT:     v_accvgpr
-; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_load_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -171,7 +171,7 @@ bb:
 ; GCN-NOT:     v_accvgpr_write
 ; GCN:         v_mfma_f32_32x32x1f32
 ; GCN-NOT:     v_accvgpr_read
-; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
+; GCN-COUNT-16: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
 define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
index bf7cce9877f86..407940631edc0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
@@ -10,14 +10,14 @@ declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr,
 define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -32,7 +32,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b8_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -45,14 +45,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -67,7 +67,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -80,14 +80,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -102,7 +102,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -115,14 +115,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -137,7 +137,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -150,7 +150,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -163,7 +163,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -176,7 +176,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
@@ -189,3 +189,29 @@ entry:
   call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1)
   ret void
 }
+
+; Verify that nested ptradd(ptradd(sgpr, zext(i32_vgpr)), const) is
+; folded to GVS mode when KnownBits proves the i32 addition won't overflow.
+; This pattern arises from SLSR transforming independent address computations
+; into incremental offsets from a shared base.
+define amdgpu_ps void @global_load_async_to_lds_b128_saddr_nested_const(ptr addrspace(1) inreg %base, ptr addrspace(3) %lds, i32 %tid_raw) {
+; GFX1250-LABEL: global_load_async_to_lds_b128_saddr_nested_const:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0x3ff, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+entry:
+  %tid = and i32 %tid_raw, 1023
+  %off = add i32 %tid, 256
+  %zext = zext i32 %off to i64
+  %gep0 = getelementptr i8, ptr addrspace(1) %base, i64 %zext
+  call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds, i32 0, i32 0)
+  %gep1 = getelementptr i8, ptr addrspace(1) %gep0, i64 256
+  call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds, i32 0, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
index e52a9739c45f4..931263bd5712a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
@@ -10,20 +10,19 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_kernel void @promote_async_load_offset_negative(ptr addrspace(1) %src) {
 ; GFX1250-LABEL: promote_async_load_offset_negative:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
+; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 0x100, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v0, s[0:1]
+; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v2, s[0:1]
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
 ; GFX1250-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff00
-; GFX1250-NEXT:    v_add_nc_u32_e64 v0, 0xfffffe00, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
-; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v[2:3], off offset:512
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v[2:3], off
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -56,19 +55,16 @@ entry:
 define amdgpu_kernel void @promote_async_load_offset_positive(ptr addrspace(1) %src) {
 ; GFX1250-LABEL: promote_async_load_offset_positive:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 nv
-; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v2, 0x100, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v0, s[0:1]
-; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
-; GFX1250-NEXT:    v_add_nc_u32_e64 v0, 0xffffff00, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], 0x100, v[2:3]
-; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v[2:3], off
-; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v[2:3], off offset:256
+; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
+; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v2, s[0:1]
+; GFX1250-NEXT:    global_load_async_to_lds_b128 v1, v0, s[0:1]
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -102,20 +98,19 @@ entry:
 define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
 ; GFX1250-LABEL: promote_async_store_offset_negative:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24 nv
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
+; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 0x100, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_async_from_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT:    global_store_async_from_lds_b128 v2, v1, s[0:1]
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
 ; GFX1250-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff00
-; GFX1250-NEXT:    v_add_nc_u32_e64 v0, 0xfffffe00, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
-; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    global_store_async_from_lds_b128 v[2:3], v0, off offset:512
 ; GFX1250-NEXT:    global_store_async_from_lds_b128 v[2:3], v1, off
 ; GFX1250-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a2b0f4d56ebea..f5aff758b60dc 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -111,53 +111,53 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v18
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v14, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v0, v16, v0
 ; GFX9-NEXT:    s_movk_i32 s1, 0x2000
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s1, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[6:7], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[6:7], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off offset:2048
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v16, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: clmem_read_simplified:
@@ -186,46 +186,47 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x1000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v20, v0
+; GFX10-NEXT:    v_add_co_u32 v16, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, 0x1000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[34:35]
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v16, 0x2000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v17, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x3000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[6:7], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, 0x3000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[8:9], off
-; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off
+; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:-2048
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x3800, v16
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v17, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[6:7], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v14, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v18, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
@@ -246,64 +247,64 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff8000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
+; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v14
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2048
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 0x2000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, v14, v0
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v2, vcc_lo
+; GFX11-NEXT:    global_load_b64 v[0:1], v3, s[34:35]
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, 0x2000
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v11, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v11, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[10:11], v[6:7], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    global_load_b64 v[6:7], v[6:7], off
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096
+; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v10
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v11, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x3000, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[12:13], v[12:13], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[12:13], v[10:11], off
+; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v7, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v5, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v9, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v13, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v11, v1, vcc_lo
+; GFX11-NEXT:    global_store_b64 v14, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1131,33 +1132,30 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:1024
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
-; GFX9-NEXT:    global_load_dword v9, v[2:3], off
-; GFX9-NEXT:    global_load_dword v10, v[2:3], off offset:1024
-; GFX9-NEXT:    global_load_dword v11, v[2:3], off offset:2048
-; GFX9-NEXT:    global_load_dword v12, v[2:3], off offset:3072
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v2, v[0:1], off
-; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:1024
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_add_u32_e32 v0, v6, v5
+; GFX9-NEXT:    s_movk_i32 s0, 0x1000
+; GFX9-NEXT:    v_add_u32_e32 v5, v4, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v6, v5, s[34:35]
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off
+; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:1024
+; GFX9-NEXT:    global_load_dword v9, v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dword v10, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v11, v[2:3], off
+; GFX9-NEXT:    global_load_dword v12, v[2:3], off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
+; GFX9-NEXT:    v_add_u32_e32 v0, v6, v6
+; GFX9-NEXT:    v_add3_u32 v0, v6, v0, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
+; GFX9-NEXT:    v_add3_u32 v0, v7, v0, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add3_u32 v0, v11, v0, v12
+; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v3
+; GFX9-NEXT:    v_add3_u32 v0, v11, v0, v12
 ; GFX9-NEXT:    global_store_dword v4, v0, s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1183,48 +1181,48 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 2
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v10
+; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x1000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dword v11, v[0:1], off
-; GFX10-NEXT:    global_load_dword v12, v[0:1], off offset:1024
-; GFX10-NEXT:    global_load_dword v13, v[4:5], off offset:-2048
-; GFX10-NEXT:    global_load_dword v14, v[2:3], off offset:1024
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 0x2000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v0
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v9, 0x1000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x1000, v9
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dword v15, v[4:5], off
-; GFX10-NEXT:    global_load_dword v16, v[6:7], off offset:1024
-; GFX10-NEXT:    global_load_dword v17, v[2:3], off offset:1024
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dword v2, v[8:9], off offset:-2048
-; GFX10-NEXT:    global_load_dword v3, v[8:9], off
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX10-NEXT:    global_load_dword v11, v6, s[34:35]
+; GFX10-NEXT:    global_load_dword v12, v[2:3], off offset:-2048
+; GFX10-NEXT:    global_load_dword v13, v[0:1], off offset:1024
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v9, 0x2000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, 0x1800, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dword v14, v[2:3], off
+; GFX10-NEXT:    global_load_dword v15, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v16, v[0:1], off offset:-2048
+; GFX10-NEXT:    global_load_dword v17, v[6:7], off offset:1024
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2000, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v10, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:1024
 ; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v12, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v11, v11
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add3_u32 v0, v13, v0, v14
+; GFX10-NEXT:    v_add3_u32 v0, v12, v0, v13
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add3_u32 v0, v15, v0, v16
+; GFX10-NEXT:    v_add3_u32 v0, v14, v0, v15
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add3_u32 v0, v2, v0, v17
+; GFX10-NEXT:    v_add3_u32 v0, v16, v0, v17
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add3_u32 v0, v3, v0, v4
-; GFX10-NEXT:    global_store_dword v10, v0, s[34:35]
+; GFX10-NEXT:    v_add3_u32 v0, v4, v0, v5
+; GFX10-NEXT:    global_store_dword v8, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: Address32:
@@ -1246,42 +1244,38 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, v6, v0
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v2, vcc_lo
+; GFX11-NEXT:    global_load_b32 v7, v3, s[34:35]
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, 0x2000
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v5, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v7, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:1024
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0x2000
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    global_load_b32 v9, v[0:1], off offset:2048
-; GFX11-NEXT:    global_load_b32 v10, v[0:1], off offset:3072
-; GFX11-NEXT:    global_load_b32 v11, v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b32 v12, v[2:3], off offset:1024
-; GFX11-NEXT:    global_load_b32 v13, v[2:3], off offset:2048
+; GFX11-NEXT:    global_load_b32 v8, v[0:1], off offset:-4096
+; GFX11-NEXT:    global_load_b32 v9, v[2:3], off offset:1024
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x2000, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_load_b32 v10, v[2:3], off offset:2048
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:1024
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v7
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[4:5], off offset:1024
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, v7, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v9, v1, v10
+; GFX11-NEXT:    v_add3_u32 v3, v7, v3, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add3_u32 v1, v11, v1, v12
+; GFX11-NEXT:    v_add3_u32 v3, v8, v3, v9
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v13, v1, v2
+; GFX11-NEXT:    v_add3_u32 v2, v10, v3, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add3_u32 v0, v3, v1, v0
+; GFX11-NEXT:    v_add3_u32 v0, v0, v2, v1
 ; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
@@ -1387,54 +1381,55 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: Offset64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s38, -1
-; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
-; GFX9-NEXT:    v_mov_b32_e32 v3, 3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
-; GFX9-NEXT:    s_endpgm
+; GFX900-LABEL: Offset64:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX900-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX900-NEXT:    s_mov_b32 s38, -1
+; GFX900-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX900-NEXT:    s_add_u32 s36, s36, s11
+; GFX900-NEXT:    s_addc_u32 s37, s37, 0
+; GFX900-NEXT:    s_getpc_b64 s[0:1]
+; GFX900-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX900-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX900-NEXT:    v_mov_b32_e32 v31, v0
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX900-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX900-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX900-NEXT:    v_and_b32_e32 v9, 0xffff8000, v1
+; GFX900-NEXT:    v_mov_b32_e32 v1, s35
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v9
+; GFX900-NEXT:    v_mov_b32_e32 v3, 3
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
+; GFX900-NEXT:    s_movk_i32 s0, 0xf000
+; GFX900-NEXT:    v_add_u32_e32 v11, v9, v3
+; GFX900-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v0
+; GFX900-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v10, vcc
+; GFX900-NEXT:    global_load_dwordx2 v[3:4], v11, s[34:35]
+; GFX900-NEXT:    global_load_dwordx2 v[5:6], v[1:2], off
+; GFX900-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:2048
+; GFX900-NEXT:    v_add_u32_e32 v1, 1, v10
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(2)
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(1)
+; GFX900-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX900-NEXT:    global_store_dwordx2 v9, v[0:1], s[34:35]
+; GFX900-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
 ; GFX10:       ; %bb.0: ; %entry
@@ -1459,17 +1454,18 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v12
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v12
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v12, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0xfffff800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v8, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v1, s[34:35]
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v8
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
@@ -1485,6 +1481,56 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
+; GFX90A-LABEL: Offset64:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT:    s_mov_b32 s38, -1
+; GFX90A-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX90A-NEXT:    s_add_u32 s36, s36, s11
+; GFX90A-NEXT:    s_addc_u32 s37, s37, 0
+; GFX90A-NEXT:    s_getpc_b64 s[0:1]
+; GFX90A-NEXT:    s_add_u32 s0, s0, _Z13get_global_idj at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s1, s1, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v31, v0
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX90A-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 s32, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
+; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_movk_i32 s0, 0xf000
+; GFX90A-NEXT:    v_add_u32_e32 v11, v10, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v11, s[34:35]
+; GFX90A-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX90A-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX90A-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX90A-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX90A-NEXT:    s_waitcnt vmcnt(2)
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
+; GFX90A-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: Offset64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -1500,21 +1546,23 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff8000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v8
+; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v8
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v1
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, v8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v9, vcc_lo
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[4:5], v1, s[34:35]
 ; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v9
+; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
@@ -1631,28 +1679,29 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s0, 0x7ffff000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0x80000000, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dword v7, v[0:1], off
-; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:2048
-; GFX9-NEXT:    global_load_dword v9, v[2:3], off offset:3072
-; GFX9-NEXT:    global_load_dword v10, v[4:5], off
+; GFX9-NEXT:    s_mov_b32 s0, 0x7ffff000
+; GFX9-NEXT:    v_add_u32_e32 v5, v4, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x80000000, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    global_load_dword v6, v5, s[34:35]
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dword v8, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_u32_e32 v0, v8, v7
+; GFX9-NEXT:    v_add_u32_e32 v0, v7, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add3_u32 v0, v9, v0, v10
-; GFX9-NEXT:    global_store_dword v6, v0, s[34:35]
+; GFX9-NEXT:    v_add3_u32 v0, v8, v0, v9
+; GFX9-NEXT:    global_store_dword v4, v0, s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: p32Offset64:
@@ -1677,26 +1726,27 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v6
+; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 0x80000000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7ffff800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, 0x80000000
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x7ffff800, v3
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v5, vcc_lo
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dword v7, v[0:1], off
-; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:-2048
-; GFX10-NEXT:    global_load_dword v9, v[2:3], off
-; GFX10-NEXT:    global_load_dword v10, v[4:5], off offset:1024
+; GFX10-NEXT:    global_load_dword v5, v6, s[34:35]
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:-2048
+; GFX10-NEXT:    global_load_dword v8, v[0:1], off
+; GFX10-NEXT:    global_load_dword v9, v[2:3], off offset:1024
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v7, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add3_u32 v0, v10, v0, v9
-; GFX10-NEXT:    global_store_dword v6, v0, s[34:35]
+; GFX10-NEXT:    v_add3_u32 v0, v9, v0, v8
+; GFX10-NEXT:    global_store_dword v4, v0, s[34:35]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: p32Offset64:
@@ -1713,31 +1763,32 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff8000, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff8000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v6
+; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x80000000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v4, v0
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ffff000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x80000000, v3
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v5, vcc_lo
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:2048
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:3072
-; GFX11-NEXT:    global_load_b32 v3, v[4:5], off
+; GFX11-NEXT:    global_load_b32 v5, v6, s[34:35]
+; GFX11-NEXT:    global_load_b32 v6, v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:3072
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v6, v5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, v2, v0, v3
-; GFX11-NEXT:    global_store_b32 v6, v0, s[34:35]
+; GFX11-NEXT:    v_add3_u32 v0, v0, v2, v1
+; GFX11-NEXT:    global_store_b32 v4, v0, s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -1852,42 +1903,31 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s37
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s36, v16
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s39
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, s38, v16
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x2000, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x3000, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v11, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off offset:2048
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x1000, v12
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[36:37]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[36:37] offset:2048
+; GFX9-NEXT:    v_add_u32_e32 v14, 0x3000, v12
+; GFX9-NEXT:    v_add_u32_e32 v13, 0x2000, v12
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v14, s[38:39]
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v13, s[38:39] offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v13, s[36:37]
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v14, s[38:39] offset:2048
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v13, v11, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v15, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37]
+; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[36:37]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: DiffBase:
@@ -1911,42 +1951,36 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff8000, v0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, s36, v16
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s0, s37, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v12, s0, s38, v16
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v13, s0, s39, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, 0x1800
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v12, 0x3000
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x1000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 0x1800, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 0x2800, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 0x3000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, 0x2000, v12
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[36:37]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[36:37]
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[14:15], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v8, s[38:39]
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v9, s[38:39]
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, 0x3800, v12
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v13, s[36:37]
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v14, s[38:39]
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v5, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v16, v[0:1], s[36:37]
+; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[36:37]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: DiffBase:
@@ -1963,41 +1997,31 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff8000, v0
-; GFX11-NEXT:    v_add_co_u32 v2, s0, s36, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s37, 0, s0
-; GFX11-NEXT:    v_add_co_u32 v8, s0, s38, v12
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 0x2000
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, s39, 0, s0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x2000, v8
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v9, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x3000, v8
-; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:-4096
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x1000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x3000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0x2000, v12
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[10:11], v[8:9], off
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[36:37]
+; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[36:37] offset:2048
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_load_b64 v[4:5], v10, s[38:39]
+; GFX11-NEXT:    global_load_b64 v[6:7], v8, s[38:39] offset:2048
+; GFX11-NEXT:    global_load_b64 v[8:9], v8, s[36:37]
+; GFX11-NEXT:    global_load_b64 v[10:11], v10, s[38:39] offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v10, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v11, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v7, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v9, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
@@ -2142,52 +2166,51 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v22
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v14, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:    v_add_u32_e32 v6, v16, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[34:35]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2048
 ; GFX9-NEXT:    s_movk_i32 s0, 0x2000
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, s0, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off offset:2048
 ; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
-; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off
-; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[12:13], off offset:2048
-; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:2048
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v15, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v16, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v12, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v18, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v20, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v21, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v22, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v16, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ReverseOrder:
@@ -2216,47 +2239,48 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v20
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v20, v0
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v8, s[34:35]
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x2000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1800, v0
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1800, v14
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[4:5], off
-; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[12:13], v[12:13], off
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, 0x1000, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_add_co_u32 v14, vcc_lo, 0x800, v14
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[14:15], off
-; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[16:17], v[12:13], off
+; GFX10-NEXT:    global_load_dwordx2 v[18:19], v[14:15], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v7, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v16, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
@@ -2280,60 +2304,60 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff8000, v1
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff8000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v16
+; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v14
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x3000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[6:7], v[2:3], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, v14, v0
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v11, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x2000, v10
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v11, vcc_lo
 ; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    global_load_b64 v[12:13], v[8:9], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[14:15], v[10:11], off
-; GFX11-NEXT:    global_load_b64 v[8:9], v[8:9], off
-; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off offset:2048
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo
+; GFX11-NEXT:    global_load_b64 v[4:5], v4, s[34:35]
+; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, 0x1000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[12:13], v[10:11], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[10:11], v[10:11], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v12, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, null, v7, v5, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v9, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v14, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v12, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v13, v1, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v10, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v11, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX11-NEXT:    global_store_b64 v16, v[0:1], s[34:35]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v5, v1, vcc_lo
+; GFX11-NEXT:    global_store_b64 v14, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index b5474b8974b29..ec450a082fb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -4674,1305 +4674,1029 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    v_lshlrev_b32_e32 v5, 13, v0
 ; GFX9-FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v5
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s3
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v2
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 20
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 36
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 52
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x44
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x54
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x64
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x100, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x74
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x84
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x94
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x180, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x104
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x114
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x124
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x134
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x144
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x154
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x164
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x200, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x174
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x184
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x194
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x280, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1f4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x204
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x214
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x224
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x234
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x244
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x254
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x264
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x300, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x274
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x284
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x294
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x380, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2f4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x304
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x314
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x324
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x334
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x344
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x354
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x364
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x400, v2
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x374
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x384
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x394
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3f4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x404
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x414
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 16
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x424
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 32
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x434
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 48
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x444
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s4, 64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x454
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x464
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x474
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x484
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x494
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x504
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x100
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x514
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x110
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x524
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x120
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x534
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x130
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x544
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x140
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x554
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x150
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x564
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x160
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x574
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x170
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x584
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x180
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x594
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x190
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x604
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x200
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x614
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x210
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x624
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x220
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x634
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x230
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x644
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x240
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x654
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x250
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x664
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x260
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x674
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x270
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x684
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x280
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x694
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x290
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x2f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x704
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x300
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x714
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x310
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x724
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x320
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x734
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x330
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x744
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x340
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x754
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x350
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x764
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x360
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x774
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x370
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x784
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x380
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x794
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x390
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x3f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x804
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x400
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x814
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x410
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x824
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x420
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x834
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x430
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x844
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x440
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x854
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x450
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x864
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x460
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x874
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x470
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x884
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x480
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x894
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x490
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x4f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x904
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x500
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x914
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x510
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x924
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x520
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x934
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x530
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x944
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x540
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x954
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x550
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x964
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x560
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x974
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x570
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x984
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x580
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x994
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x590
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x5f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x600
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x610
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x620
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x630
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x640
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x650
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x660
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x670
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x680
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x690
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xab4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xac4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xad4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xae4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x6f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x700
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x710
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x720
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x730
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x740
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x750
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x760
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x770
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x780
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x790
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xba4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x7f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x800
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x810
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x820
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x830
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x840
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x850
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x860
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x870
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x880
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x890
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xca4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xce4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x8f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x900
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x910
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x920
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x930
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x940
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x950
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x960
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x970
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x980
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x990
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xda4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xde4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x9f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xa90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xea4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xeb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xab0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xec4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xac0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xed4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xad0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xee4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xae0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xef4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xaf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xb90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xba0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbe0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xff4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xbf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1004
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1014
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1024
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1034
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1044
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1054
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1064
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1074
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1084
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1094
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xc90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xca0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xce0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x10f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xcf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1104
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1114
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1124
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1134
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1144
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1154
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1164
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1174
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1184
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1194
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xd90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xda0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xde0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x11f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xdf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1204
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1214
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1224
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1234
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1244
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1254
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1264
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1274
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1284
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1294
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xe90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xea0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xeb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xec0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xed0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xee0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x12f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xef0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1304
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1314
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1324
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1334
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1344
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1354
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1364
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1374
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1384
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x1394
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xf90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0x13e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s4, 0xfe0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfe0
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
@@ -5987,1301 +5711,1027 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v5
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4080
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4064
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4048
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4032
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x13a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4016
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1394
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:4000
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1384
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3984
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1374
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1364
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3952
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1354
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3936
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1344
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3920
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1334
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3904
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1324
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3888
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1314
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3872
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1304
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3856
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xef0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3840
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xee0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3824
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xed0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3808
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xec0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3792
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xeb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3776
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x12a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xea0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3760
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1294
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3744
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1284
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3728
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1274
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3712
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1264
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3696
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1254
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3680
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1244
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3664
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1234
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3648
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1224
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3632
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1214
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3616
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1204
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3600
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3584
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xde0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3568
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3552
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3536
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3520
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x11a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xda0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3504
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1194
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3488
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1184
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3472
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1174
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3456
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1164
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3440
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1154
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3424
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1144
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3408
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1134
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3392
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1124
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3376
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1114
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3360
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1104
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3344
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3328
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xce0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3312
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3296
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3280
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3264
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x10a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xca0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3248
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1094
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3232
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1084
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3216
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1074
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3200
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1064
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3184
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1054
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3168
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1044
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3152
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1034
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3136
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1024
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3120
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1014
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3104
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1004
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3088
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xff4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3072
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbe0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3056
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3040
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3024
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:3008
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xfa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xba0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2992
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2976
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2960
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2944
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2928
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2912
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2896
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2880
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2864
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2848
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2832
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xef4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xaf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2816
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xee4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xae0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2800
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xed4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xad0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2784
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xec4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xac0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2768
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xeb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xab0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2752
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xea4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xaa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2736
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2720
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2704
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2688
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2672
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2656
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa40
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2640
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa30
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2624
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa20
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2608
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa10
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2592
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa00
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2576
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2560
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xde4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2544
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2528
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2512
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xdb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2496
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xda4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2480
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x990
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2464
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x980
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2448
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x970
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2432
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x960
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2416
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x950
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2400
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x940
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2384
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x930
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2368
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x920
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2352
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x910
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2336
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x900
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2320
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2304
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xce4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2288
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2272
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2256
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xcb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2240
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xca4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2224
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x890
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2208
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x880
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2192
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x870
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2176
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x860
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2160
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x850
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2144
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x840
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2128
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x830
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2112
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x820
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2096
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x810
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2080
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x800
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2064
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2048
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbe4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbd4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbc4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xbb4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xba4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x790
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x780
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x770
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x760
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x750
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x740
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x730
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x720
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x710
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x700
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xaf4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xae4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xad4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xac4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xab4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xaa4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa94
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x690
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa84
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x680
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa74
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x670
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa64
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x660
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa54
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x650
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa44
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x640
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa34
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x630
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa24
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x620
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa14
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x610
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa04
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x600
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x9a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x994
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x590
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x984
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x580
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x974
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x570
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x964
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x560
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x954
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x550
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x944
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x540
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x934
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x530
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x924
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x520
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x914
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x510
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x904
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x500
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x8a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x894
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x490
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x884
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x480
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x874
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x470
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x864
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x460
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x854
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x450
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x844
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x440
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x834
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x430
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x824
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x420
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x814
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x410
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x804
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x400
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x7a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x3a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x794
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x390
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x784
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x380
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x774
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x370
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x764
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x360
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x754
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x350
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x744
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x340
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x734
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x330
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x724
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x320
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x714
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x310
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x704
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x300
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x6a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x2a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x694
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x290
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x684
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x280
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x674
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x270
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x664
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x260
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x654
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x250
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x644
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x240
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x634
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x230
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x624
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x220
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x614
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x210
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x604
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x200
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1f0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1e0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1d0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x5a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x1a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x594
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x190
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x584
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x180
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x574
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x170
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x564
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x160
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x554
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x150
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x544
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x140
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x534
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x130
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x524
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x120
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x514
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x110
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x504
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x100
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4f4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xf0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4e4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xe0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4d4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xd0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4c4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xc0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4b4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xb0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x4a4
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0xa0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x494
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x90
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x484
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x80
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x474
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x70
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x464
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x60
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x454
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x50
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x444
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 64
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x434
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 48
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x424
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 32
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x414
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 16
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s2, 0x404
+; GFX9-FLATSCR-NEXT:    s_mov_b32 s2, 0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3f4
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x400, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x3a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x394
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x384
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x374
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x380, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x364
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x354
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x344
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x334
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x324
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x314
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x304
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2f4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x300, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x294
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x284
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x274
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x280, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x264
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x254
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x244
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x234
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x224
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x214
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x204
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1f4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x200, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1e4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1d4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1c4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1b4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x1a4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x194
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x184
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x174
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x180, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x164
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x154
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x144
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x134
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x124
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x114
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x104
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xf4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x100, v4
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xe4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xd4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xc4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xb4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0xa4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x94
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x84
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x74
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:3968
-; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v4
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x64
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4080
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x54
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4064
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x44
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4048
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 52
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4032
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 36
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4016
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 20
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:4000
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_mov_b32 s0, 4
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:3984
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:3968
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
 ; GFX10-FLATSCR-LABEL: test:
@@ -7295,13 +6745,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-FLATSCR-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 ; GFX10-FLATSCR-NEXT:    v_lshlrev_b32_e32 v5, 13, v0
 ; GFX10-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v4, s4, s2, v5
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, s3, 0, s4
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v2, s4, s2, v5
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s4
 ; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x804
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x80, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v2, vcc_lo, 0x100, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x80, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:4 ; 16-byte Folded Spill
@@ -7324,1181 +6774,682 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:100 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x100, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:116 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:1920
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:132 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:1936
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:148 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:1952
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:164 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:1968
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:180 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:1984
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:196 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:2000
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:212 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:2016
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:228 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off offset:2032
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x180, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:244 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v6, vcc_lo, 0x180, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:1920
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:260 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:260 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:276 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:276 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:292 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:292 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:308 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:308 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:324 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:324 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:340 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:340 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:356 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v[6:7], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:356 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x200, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], off offset:372 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v8, vcc_lo, 0x200, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:372 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:388 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:388 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:404 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:404 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:420 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:420 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:436 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:436 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:452 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:452 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:468 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:468 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:484 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[10:13], v[8:9], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:484 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x280, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[10:13], off offset:500 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v10, vcc_lo, 0x280, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:500 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:516 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:516 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:532 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:532 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:548 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:548 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:564 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:564 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:580 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:580 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:596 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:596 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:612 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[12:15], v[10:11], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:612 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x300, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[12:15], off offset:628 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v12, vcc_lo, 0x300, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v13, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:628 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:644 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:644 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:660 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:660 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:676 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:676 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:692 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:692 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:708 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:708 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:724 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:724 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:740 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[14:17], v[12:13], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:740 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x380, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[14:17], off offset:756 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v14, vcc_lo, 0x380, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v15, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:756 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:772 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:772 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:788 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:788 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:804 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:804 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:820 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:820 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:836 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:836 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:852 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:852 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:868 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v[14:15], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:868 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x400, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], off offset:884 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v16, vcc_lo, 0x400, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v17, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:884 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:900 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:900 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:916 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:916 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:932 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:932 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:948 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:948 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:964 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:964 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:980 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:980 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:996 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[18:21], v[16:17], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:996 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x480, v2
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[18:21], off offset:1012 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v18, vcc_lo, 0x480, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v19, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x500, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1012 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1028 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1028 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1044 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1044 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1060 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1060 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1076 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1076 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1092 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1092 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1108 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1108 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    s_clause 0x1
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], off offset:1124 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1124 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[18:19], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1140 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1140 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1156 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1156 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1172 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1172 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1188 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1188 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1204 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1204 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1220 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1220 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1236 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1236 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1252 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1252 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x580, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1268 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1268 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1284 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1284 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1300 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1300 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1316 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1316 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1332 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1332 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1348 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1348 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1364 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1364 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1380 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1380 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x600, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1396 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1396 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1412 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1412 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1428 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1428 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1444 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1444 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1460 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1460 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1476 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1476 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1492 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1492 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1508 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1508 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x680, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1524 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1524 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1540 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1540 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1556 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1556 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1572 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1572 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1588 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1588 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1604 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1604 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1620 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1620 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1636 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1636 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x700, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1652 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1652 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1668 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1668 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1684 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1684 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1700 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1700 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1716 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1716 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1732 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1748 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2016
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1764 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[23:26], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v20, vcc_lo, 0x780, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[23:26], off offset:1780 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1796 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1812 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1828 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1844 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1860 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1876 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:2016
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[22:25], off offset:1892 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[20:21], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1908 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1924 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1940 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1956 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1972 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:1988 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:2004 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:2020 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v2
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], off offset:2036 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x814
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x824
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x834
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x844
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x854
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x864
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x874
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v6
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v7, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x884
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x894
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8a4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8b4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8c4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8d4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8e4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8f4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v8
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v9, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x904
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x914
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x924
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x934
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x944
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x954
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x964
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x974
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v10
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v11, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x984
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x994
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9a4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9b4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9c4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9d4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9e4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9f4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v12
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa04
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa64
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v14
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v15, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa84
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xaa4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xab4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xac4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xad4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xae4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xaf4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v16
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v17, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb04
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb64
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v18
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v19, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb84
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1920
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xba4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbb4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbc4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbd4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_clause 0x1
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:2016
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbe4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbf4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc04
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc64
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc84
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xca4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xcb4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xcc4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xcd4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xce4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xcf4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd04
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd64
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd84
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xd94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xda4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xdb4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xdc4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xdd4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xde4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xdf4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe04
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1732 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1748 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1764 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1780 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1796 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xe94
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1812 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xea4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1828 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xeb4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1844 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xec4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1860 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xed4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1876 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xee4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1892 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xef4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1908 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1924 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1940 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1956 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1972 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:1988 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:2004 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:2020 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:2036 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xf94
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x814
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xfa4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x824
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xfb4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x834
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xfc4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x844
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xfd4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x854
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xfe4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x864
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xff4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x874
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1004
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x884
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1014
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x894
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1024
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8a4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1034
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8b4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1044
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8c4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1054
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8d4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1064
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8e4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1074
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x8f4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1084
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x904
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1094
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x914
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x924
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x934
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x944
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x954
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x964
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x10f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x974
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1104
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1114
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x994
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1124
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9a4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1134
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9b4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1144
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9c4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1154
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9d4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1164
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9e4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1174
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x9f4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1184
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1194
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x11f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1204
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1214
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xa94
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1224
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xaa4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1234
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xab4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1244
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xac4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1254
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xad4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1264
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xae4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1274
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xaf4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1284
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1294
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x12f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1304
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1314
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xb94
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1324
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xba4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1334
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbb4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1344
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbc4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1354
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbd4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1364
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbe4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1374
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xbf4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1384
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x1394
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x13a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x13b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x13c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x13d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0x13e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s4, 0xc64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
@@ -8506,7 +7457,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    v_add_co_u32 v4, s2, s0, v5
 ; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v6, null, s1, 0, s2
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x13e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc64
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
@@ -8519,1203 +7470,710 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1140 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x13d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x13c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x13b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x13a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1394
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1384
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1374
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xbf4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1364
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xbe4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1354
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xbd4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1344
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xbc4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1334
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xbb4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1324
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xba4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1314
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb94
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1304
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x12a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1294
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1284
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xb04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1274
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xaf4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1264
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xae4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1254
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xad4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1244
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xac4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xab4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1224
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xaa4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1214
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa94
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1204
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa84
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa64
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa54
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa44
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa34
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x11a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa24
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1194
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa14
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1184
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xa04
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1174
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9f4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1164
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9e4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1154
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9d4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1144
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9c4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1134
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9b4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1124
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x9a4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1114
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x994
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1104
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x984
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10f4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x974
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10e4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x964
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10d4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x954
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10c4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x944
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10b4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x934
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x10a4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x924
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1094
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x914
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1084
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x904
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1074
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8f4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1064
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8e4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1054
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8d4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1044
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8c4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1034
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8b4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1024
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x8a4
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1014
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x894
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x1004
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x884
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xff4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x874
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xfe4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x864
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xfd4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x854
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xfc4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x844
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xfb4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x834
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xfa4
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x824
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf94
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x814
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf84
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0x804
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
 ; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf74
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf64
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:2036 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf54
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:2020 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf44
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:2004 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf34
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1988 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf24
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1972 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf14
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1956 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xf04
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1940 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xef4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1924 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xee4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1908 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xed4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1892 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xec4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1876 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xeb4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1860 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xea4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1844 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe94
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1828 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe84
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1812 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe74
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1796 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe64
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1780 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe54
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1764 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe44
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1748 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe34
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1732 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe24
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1716 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe14
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1700 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xe04
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1684 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xdf4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1668 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xde4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1652 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xdd4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1636 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xdc4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1620 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xdb4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1604 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xda4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1588 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd94
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1572 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd84
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1556 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd74
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1540 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd64
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1524 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd54
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1508 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd44
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1492 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd34
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1476 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd24
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1460 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd14
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1444 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xd04
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1428 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xcf4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1412 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xce4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1396 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xcd4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1380 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xcc4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1364 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xcb4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1348 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xca4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1332 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc94
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1316 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc84
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1300 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc74
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1284 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc64
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1268 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc54
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1252 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc44
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1236 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc34
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1220 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc24
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1204 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc14
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1188 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s2, 0xc04
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1172 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:1156 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xbf4
 ; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x480, v4
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v2, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xbe4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xbd4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xbc4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xbb4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xba4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb84
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[7:10], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v2, vcc_lo, 0x400, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb64
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v7, vcc_lo, 0x780, v2
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v3, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xb04
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xaf4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[9:12], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v7, vcc_lo, 0x380, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xae4
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v9, vcc_lo, 0x780, v7
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xad4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xac4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xab4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xaa4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa94
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa84
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa74
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[11:14], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v9, vcc_lo, 0x300, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa64
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v11, vcc_lo, 0x780, v9
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v10, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa54
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa44
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa34
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa24
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa14
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0xa04
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9f4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[13:16], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v11, vcc_lo, 0x280, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9e4
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v13, vcc_lo, 0x780, v11
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v12, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9d4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9c4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9b4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x9a4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x994
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x984
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x974
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[15:18], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v13, vcc_lo, 0x200, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x964
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v15, vcc_lo, 0x780, v13
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v14, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x954
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x944
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x934
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x924
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x914
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x904
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8f4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[17:20], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v15, vcc_lo, 0x180, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8e4
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v17, vcc_lo, 0x780, v15
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8d4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8c4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8b4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x8a4
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x894
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x884
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x874
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[19:22], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v17, vcc_lo, 0x100, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v18, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x864
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v19, vcc_lo, 0x780, v17
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v20, null, 0, v18, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x854
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x844
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x834
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x824
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x814
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x804
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[21:24], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:2036 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v19, vcc_lo, 0x80, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v20, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x780, v19
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:2020 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:2004 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1988 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1972 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1956 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1940 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1924 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1908 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x780, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1892 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1876 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1860 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1844 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1828 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1812 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1796 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1780 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x700, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1764 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1748 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1732 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1716 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1700 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1684 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1668 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1652 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x680, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1636 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1620 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1604 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1588 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1572 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1556 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1540 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1524 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x600, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1508 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1492 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1476 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1460 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1444 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1428 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1412 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1396 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v21, vcc_lo, 0x580, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    v_add_co_u32 v4, vcc_lo, 0x500, v4
-; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1380 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1364 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1348 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1332 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1316 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1300 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[23:26], off, off offset:1284 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[21:22], v[23:26], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1268 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1252 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1236 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1220 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1204 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1188 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1172 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1156 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[21:24], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1140 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1124 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1124 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1108 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1108 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1092 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1092 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1076 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1076 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1060 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1060 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1044 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1044 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1028 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1028 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[21:24], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:1012 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:1012 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x400, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:996 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:996 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:980 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:980 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:964 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:964 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:948 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:948 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:932 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:932 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:916 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:916 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[21:24], off, off offset:900 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:900 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[2:3], v[21:24], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:884 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:884 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x380, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:868 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:868 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:852 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:852 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:836 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:836 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:820 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:820 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:804 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:804 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:788 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:788 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:772 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:772 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:756 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:756 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x300, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:740 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:740 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:724 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:724 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:708 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:708 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:692 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:692 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:676 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:676 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:660 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:660 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:644 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:644 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[9:10], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:628 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:628 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x280, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:612 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:612 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:596 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:596 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:580 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:580 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:564 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:564 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:548 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:548 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:532 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:532 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:516 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:516 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:500 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:500 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x200, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:484 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:484 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:468 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:468 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:452 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:452 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:436 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:436 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:420 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:420 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:404 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:404 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:388 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:388 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[13:14], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:372 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:372 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x180, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:356 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:356 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:340 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:340 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:324 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:324 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:308 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:308 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:292 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:292 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:276 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:276 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:260 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:260 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[15:16], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:244 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:244 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x100, v4
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:228 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:228 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:212 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:212 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:196 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:196 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:180 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:180 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:164 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:164 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:148 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:148 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:132 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[7:10], off, off offset:132 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[17:18], v[0:3], off offset:1920
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:116 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:1920
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x80, v4
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:116 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT:    v_add_co_u32 v0, vcc_lo, 0x780, v0
+; GFX10-FLATSCR-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:2032
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:100 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:2032
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:100 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:2016
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:84 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:2016
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:84 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:2000
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:68 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:2000
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:68 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:1984
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:52 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:1984
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:52 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:1968
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:36 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:1968
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:36 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:1952
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:20 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:1952
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:20 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:1936
-; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:4 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:1936
+; GFX10-FLATSCR-NEXT:    scratch_load_dwordx4 v[2:5], off, off offset:4 ; 16-byte Folded Reload
 ; GFX10-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[19:20], v[0:3], off offset:1920
+; GFX10-FLATSCR-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:1920
 ; GFX10-FLATSCR-NEXT:    s_endpgm
 entry:
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)



More information about the llvm-commits mailing list