[llvm] e302711 - RegAllocGreedy: Try local instruction splitting with subranges

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 12 06:23:59 PDT 2022


Author: Matt Arsenault
Date: 2022-09-12T09:03:55-04:00
New Revision: e30271169fa9c3e2b8b55e3d7cb73706a06eadd8

URL: https://github.com/llvm/llvm-project/commit/e30271169fa9c3e2b8b55e3d7cb73706a06eadd8
DIFF: https://github.com/llvm/llvm-project/commit/e30271169fa9c3e2b8b55e3d7cb73706a06eadd8.diff

LOG: RegAllocGreedy: Try local instruction splitting with subranges

This was only trying this to relax register class constraints, but
this can also help if there are subranges involved.

This solves a compilation failure for AMDGPU when there is high
pressure created by large register tuples. If one virtual register is
using most of the available budget, we need to be able to evict
subranges.

This solves the immediate failure, but this solution leaves a lot to
be desired. In the relevant testcases, we have 32-element tuples but
most of the uses are operations on 1 element subranges of it. What
we're now getting is a spill and restore of the full 1024 bits and an
extract of the used 32-bits. It would be far better if we introduced a
copy to a new virtual register with a smaller register class and used
narrower spills.

Furthermore, we could probably do a better job if the allocator were
to introduce new subranges where none previously existed in the
highest pressure scenarios. The block and region splits should also
try to split specific subranges out.

The mve-vst3.ll test changes look like noise to me, but instruction
count increased by one. mve-vst4.ll looks like a solid improvement
with several 16-byte spills eliminated. splitkit-copy-live-lanes.mir
also shows a solid reduction in total spill count.

This could use more tests but it's pretty tiring to come up with cases
that fail on this.

Added: 
    llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
    llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll

Modified: 
    llvm/lib/CodeGen/RegAllocGreedy.cpp
    llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 2c21e7d346fbc..8c412b2aeb12c 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1241,6 +1241,55 @@ static unsigned getNumAllocatableRegsForConstraints(
   return RCI.getNumAllocatableRegs(ConstrainedRC);
 }
 
+static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
+                                       const TargetRegisterInfo &TRI,
+                                       const MachineInstr &MI, Register Reg) {
+  LaneBitmask Mask;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg == 0 && MO.isUse()) {
+      Mask |= MRI.getMaxLaneMaskForVReg(Reg);
+      continue;
+    }
+
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+    if (MO.isDef()) {
+      if (!MO.isUndef())
+        Mask |= ~SubRegMask;
+    } else
+      Mask |= SubRegMask;
+  }
+
+  return Mask;
+}
+
+/// Return true if \p MI at \p Use reads a subset of the lanes live in \p
+/// VirtReg.
+static bool readsLaneSubset(const MachineRegisterInfo &MRI,
+                            const MachineInstr *MI, const LiveInterval &VirtReg,
+                            const TargetRegisterInfo *TRI, SlotIndex Use) {
+  // Early check the common case.
+  if (MI->isCopy() &&
+      MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg())
+    return false;
+
+  // FIXME: We're only considering uses, but should we consider defs too?
+  LaneBitmask ReadMask = getInstReadLaneMask(MRI, *TRI, *MI, VirtReg.reg());
+
+  LaneBitmask LiveAtMask;
+  for (const LiveInterval::SubRange &S : VirtReg.subranges()) {
+    if (S.liveAt(Use))
+      LiveAtMask |= S.LaneMask;
+  }
+
+  // If the live lanes aren't different from the lanes used by the instruction,
+  // this doesn't help.
+  return (ReadMask & ~(LiveAtMask & TRI->getCoveringLanes())).any();
+}
+
 /// tryInstructionSplit - Split a live range around individual instructions.
 /// This is normally not worthwhile since the spiller is doing essentially the
 /// same thing. However, when the live range is in a constrained register
@@ -1253,8 +1302,13 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
                                        SmallVectorImpl<Register> &NewVRegs) {
   const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
   // There is no point to this if there are no larger sub-classes.
-  if (!RegClassInfo.isProperSubClass(CurRC))
-    return 0;
+
+  bool SplitSubClass = true;
+  if (!RegClassInfo.isProperSubClass(CurRC)) {
+    if (!VirtReg.hasSubRanges())
+      return 0;
+    SplitSubClass = false;
+  }
 
   // Always enable split spill mode, since we're effectively spilling to a
   // register.
@@ -1277,14 +1331,19 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
   // Otherwise, splitting just inserts uncoalescable copies that do not help
   // the allocation.
   for (const SlotIndex Use : Uses) {
-    if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use))
+    if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) {
       if (MI->isFullCopy() ||
-          SuperRCNumAllocatableRegs ==
-              getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
-                                                  TII, TRI, RegClassInfo)) {
+          (SplitSubClass &&
+           SuperRCNumAllocatableRegs ==
+               getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
+                                                   TII, TRI, RegClassInfo)) ||
+          // TODO: Handle split for subranges with subclass constraints?
+          (!SplitSubClass && VirtReg.hasSubRanges() &&
+           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) {
         LLVM_DEBUG(dbgs() << "    skip:\t" << Use << '\t' << *MI);
         continue;
       }
+    }
     SE->openIntv();
     SlotIndex SegStart = SE->enterIntvBefore(Use);
     SlotIndex SegStop = SE->leaveIntvAfter(Use);

diff --git a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
new file mode 100644
index 0000000000000..f4e2a4991bd0f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-regalloc -stress-regalloc=3 -start-before=greedy,1 -stop-before=virtregrewriter,1 -o - %s | FileCheck %s
+---
+name: split_instruction_subranges
+alignment:       1
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: split_instruction_subranges
+    ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %9.sub1
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %11.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit %11.sub0
+    ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %7.sub1
+    ; CHECK-NEXT: S_ENDPGM 0
+    %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    %3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    S_NOP 0, implicit %1.sub1
+    S_NOP 0, implicit %2.sub0
+    S_NOP 0, implicit %3.sub1
+    S_ENDPGM 0
+
+...
+
+---
+name: split_instruction_subranges_use_is_subreg_def
+alignment:       1
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: split_instruction_subranges_use_is_subreg_def
+    ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %13.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit-def %13.sub1
+    ; CHECK-NEXT: undef %15.sub0:vreg_64 = COPY %13.sub0
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit-def %7.sub0
+    ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY %7.sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY %15.sub0
+    ; CHECK-NEXT: S_NOP 0, implicit %14.sub0
+    ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY %9.sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %8.sub1
+    ; CHECK-NEXT: S_ENDPGM 0
+    %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    %3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    S_NOP 0, implicit-def %1.sub0
+    S_NOP 0, implicit-def %2.sub1
+    S_NOP 0, implicit-def %3.sub0
+    S_NOP 0, implicit %1.sub1
+    S_NOP 0, implicit %2.sub0
+    S_NOP 0, implicit %3.sub1
+    S_ENDPGM 0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
new file mode 100644
index 0000000000000..b1f864fa92eac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -0,0 +1,418 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-regalloc -start-before=greedy,0 -stop-after=virtregrewriter,0  -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck %s
+
+# The allocation would previously fail due to poor ordering based on
+# register class. The super wide tuples should be allocated first so
+# that we don't need to try to evict them later. Currently we cannot
+# partially evict interfering register tuples.
+
+---
+name:            need_large_tuple_split
+alignment:       1
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 1, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 3, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 4, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 5, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 6, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 7, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 8, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 9, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' }
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  ; CHECK-LABEL: name: need_large_tuple_split
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr33 = COPY $sgpr14
+  ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr20_sgpr21 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr22_sgpr23 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr52 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr53 = S_MOV_B32 1083786240
+  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr58 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr59 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr61 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr63 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr58 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr59 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr61 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr63 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr33
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr15
+  ; CHECK-NEXT:   SI_SPILL_S32_SAVE killed renamable $sgpr16, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, CustomRegMask($sgpr60,$sgpr62)
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.17(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr60, $sgpr62
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr12 = COPY killed renamable $sgpr60
+  ; CHECK-NEXT:   $sgpr13 = COPY killed renamable $sgpr62
+  ; CHECK-NEXT:   $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_noregs, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr20_sgpr21, undef renamable $sgpr88_sgpr89, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr88_sgpr89 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead %27:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr22_sgpr23, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr90_sgpr91 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr92_sgpr93 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %30:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr33, 11, implicit-def $m0, implicit $m0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr90_sgpr91, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr68_sgpr69, implicit $exec
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr64 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY renamable $sgpr34_sgpr35
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr6_sgpr7
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY renamable $sgpr52_sgpr53
+  ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr10_sgpr11
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY renamable $sgpr38_sgpr39
+  ; CHECK-NEXT:   renamable $sgpr42_sgpr43 = COPY killed renamable $sgpr12_sgpr13
+  ; CHECK-NEXT:   $sgpr12 = COPY renamable $sgpr33
+  ; CHECK-NEXT:   $sgpr13 = COPY renamable $sgpr15
+  ; CHECK-NEXT:   renamable $sgpr36 = COPY killed renamable $sgpr16
+  ; CHECK-NEXT:   renamable $sgpr37 = COPY killed renamable $sgpr15
+  ; CHECK-NEXT:   renamable $sgpr40 = COPY killed renamable $sgpr8
+  ; CHECK-NEXT:   renamable $sgpr44_sgpr45 = COPY killed renamable $sgpr18_sgpr19
+  ; CHECK-NEXT:   renamable $sgpr46_sgpr47 = COPY killed renamable $sgpr20_sgpr21
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr22_sgpr23
+  ; CHECK-NEXT:   renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr24_sgpr25
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY renamable $sgpr64_sgpr65
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
+  ; CHECK-NEXT:   renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr50_sgpr51
+  ; CHECK-NEXT:   renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr48_sgpr49
+  ; CHECK-NEXT:   renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr46_sgpr47
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr44_sgpr45
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr42_sgpr43
+  ; CHECK-NEXT:   renamable $sgpr8 = COPY killed renamable $sgpr40
+  ; CHECK-NEXT:   renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr38_sgpr39
+  ; CHECK-NEXT:   renamable $sgpr15 = COPY killed renamable $sgpr37
+  ; CHECK-NEXT:   renamable $sgpr16 = COPY killed renamable $sgpr36
+  ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr52_sgpr53
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term renamable $sgpr92_sgpr93
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.12(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.13(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr88_sgpr89
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13:
+  ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.14(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.15, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14:
+  ; CHECK-NEXT:   successors: %bb.15(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.15:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.16(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.11, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.16:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.17:
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
+
+    %11:sgpr_32 = COPY $sgpr16
+    %12:sgpr_32 = COPY $sgpr15
+    %13:sgpr_32 = COPY $sgpr14
+    %14:sgpr_64 = COPY $sgpr10_sgpr11
+    %15:sgpr_64 = COPY $sgpr8_sgpr9
+    %16:sgpr_64 = COPY $sgpr6_sgpr7
+    %17:sgpr_64 = COPY $sgpr4_sgpr5
+    %5:sreg_64_xexec = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+    %6:sreg_64_xexec = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+    %7:sreg_64_xexec = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+    %8:sreg_64_xexec = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+    undef %19.sub16:sgpr_1024 = S_MOV_B32 0
+    %9:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %20:sreg_32_xm0_xexec, undef %18:vgpr_32, implicit $exec
+    %21:vreg_1024_align2 = COPY %19, implicit $exec
+    %10:sreg_64_xexec = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
+    %19.sub17:sgpr_1024 = S_MOV_B32 1083786240
+    S_BRANCH %bb.1
+
+  bb.1:
+    $vcc = S_AND_B64 $exec, %10, implicit-def dead $scc
+    %22:vreg_1024_align2 = COPY %21
+    S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+    S_BRANCH %bb.17
+
+  bb.2:
+    undef %23.sub0:sgpr_1024 = COPY %19.sub16
+    %23.sub1:sgpr_1024 = COPY %19.sub16
+    %23.sub2:sgpr_1024 = COPY %19.sub16
+    %23.sub3:sgpr_1024 = COPY %19.sub16
+    %23.sub4:sgpr_1024 = COPY %19.sub16
+    %23.sub5:sgpr_1024 = COPY %19.sub16
+    %23.sub6:sgpr_1024 = COPY %19.sub16
+    %23.sub7:sgpr_1024 = COPY %19.sub16
+    %23.sub8:sgpr_1024 = COPY %19.sub16
+    %23.sub9:sgpr_1024 = COPY %19.sub16
+    %23.sub10:sgpr_1024 = COPY %19.sub16
+    %23.sub11:sgpr_1024 = COPY %19.sub16
+    %23.sub12:sgpr_1024 = COPY %19.sub16
+    %23.sub13:sgpr_1024 = COPY %19.sub16
+    %23.sub14:sgpr_1024 = COPY %19.sub16
+    %23.sub15:sgpr_1024 = COPY %19.sub16
+    %23.sub16:sgpr_1024 = COPY %19.sub16
+    %23.sub17:sgpr_1024 = COPY %19.sub16
+    %23.sub18:sgpr_1024 = COPY %19.sub16
+    %23.sub19:sgpr_1024 = COPY %19.sub16
+    %23.sub20:sgpr_1024 = COPY %19.sub16
+    %23.sub21:sgpr_1024 = COPY %19.sub16
+    %23.sub22:sgpr_1024 = COPY %19.sub16
+    %23.sub23:sgpr_1024 = COPY %19.sub16
+    %23.sub24:sgpr_1024 = COPY %19.sub16
+    %23.sub25:sgpr_1024 = COPY %19.sub16
+    %23.sub26:sgpr_1024 = COPY %19.sub16
+    %23.sub27:sgpr_1024 = COPY %19.sub16
+    %23.sub28:sgpr_1024 = COPY %19.sub16
+    %23.sub29:sgpr_1024 = COPY %19.sub16
+    %23.sub30:sgpr_1024 = COPY %19.sub16
+    %23.sub31:sgpr_1024 = COPY %19.sub16
+    %21:vreg_1024_align2 = COPY %23, implicit $exec
+    S_CBRANCH_EXECZ %bb.11, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.3:
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL undef %24:sreg_64_xexec, 0, CustomRegMask($sgpr60,$sgpr62)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+
+  bb.4:
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr12 = COPY %13
+    $sgpr13 = COPY %12
+    $sgpr14 = COPY %11
+    dead $sgpr30_sgpr31 = SI_CALL undef %25:sreg_64, 0, csr_amdgpu_noregs, implicit killed $sgpr12, implicit killed $sgpr13, implicit $sgpr14
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    S_BRANCH %bb.17
+
+  bb.5:
+    %26:sreg_64 = S_AND_B64 %7, undef %3, implicit-def dead $scc
+    %3:sreg_64 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+    $exec = S_MOV_B64_term %26
+    S_CBRANCH_EXECZ %bb.12, implicit $exec
+
+  bb.6:
+    dead %27:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %8, implicit $exec
+
+  bb.7:
+    %0:sreg_64_xexec = nofpexcept V_CMP_NLT_F64_e64 0, undef %28:sreg_64, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+    %1:sreg_64 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+    dead %30:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 %22, undef %13, 11, implicit-def $m0, implicit $m0, implicit $exec
+
+  bb.8:
+    $vcc = S_AND_B64 $exec, %0, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+
+  bb.9:
+    %31:vreg_64_align2 = COPY %19.sub16_sub17, implicit $exec
+    GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, %31, undef %24:sreg_64_xexec, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5, implicit $exec
+    dead %33:sreg_64_xexec = V_CMP_NE_U32_e64 1, %32, implicit $exec
+    undef %34.sub0:sreg_64 = S_ADD_U32 %15.sub0, 32, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr4_sgpr5 = COPY %17
+    $sgpr6_sgpr7 = COPY %16
+    $sgpr10_sgpr11 = COPY %14
+    $sgpr12 = COPY %13
+    $sgpr13 = COPY %12
+    dead $sgpr30_sgpr31 = SI_CALL undef %33, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr8_sgpr9 = COPY %34
+    dead $sgpr30_sgpr31 = SI_CALL undef %33, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $exec = S_MOV_B64_term %1
+    S_CBRANCH_EXECZ %bb.10, implicit $exec
+    S_BRANCH %bb.17
+
+  bb.10:
+    S_CBRANCH_EXECZ %bb.8, implicit $exec
+    S_BRANCH %bb.12
+
+  bb.11:
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+    S_BRANCH %bb.17
+
+  bb.12:
+    $exec = S_MOV_B64_term %3
+    S_CBRANCH_EXECZ %bb.11, implicit $exec
+
+  bb.13:
+    $vcc = S_AND_B64 $exec, %9, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.15, implicit $vcc
+    S_BRANCH %bb.14
+
+  bb.14:
+
+  bb.15:
+    $vcc = S_AND_B64 $exec, %6, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.11, implicit $vcc
+
+  bb.16:
+    S_CBRANCH_EXECZ %bb.3, implicit $exec
+
+  bb.17:
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index 19a1b54477747..b9fa585409df5 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -31,46 +31,28 @@ body:             |
     ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %52.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %52, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %57.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %57, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %67, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %72.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %72, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %77.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %77, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %82, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %91.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %19.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK-NEXT: undef %153.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %153, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %102.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %106.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %106, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %111.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %54.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %61.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %68.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %75.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %89.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %94.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %99.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %104.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %139.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %185.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %166.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %118.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: undef %115.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %119.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK-NEXT: undef %127.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %127, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %128.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %133.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
-    ; CHECK-NEXT: undef %138.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %142.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %146.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK-NEXT: undef %150.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %150, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %156.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK-NEXT: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK-NEXT: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK-NEXT: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -80,73 +62,141 @@ body:             |
     ; CHECK-NEXT: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
     ; CHECK-NEXT: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
     ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE2]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %131.sub2:vreg_128 = COPY %87.sub2
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %131, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %134.sub2:vreg_128 = COPY %91.sub2
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %134, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: %19.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %103.sub2:vreg_128 = COPY %102.sub2
-    ; CHECK-NEXT: %103.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %112.sub2:vreg_128 = COPY %111.sub2
-    ; CHECK-NEXT: %112.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
-    ; CHECK-NEXT: undef %116.sub2:vreg_128 = COPY %115.sub2
-    ; CHECK-NEXT: %116.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %120.sub2:vreg_128 = COPY %119.sub2
-    ; CHECK-NEXT: %120.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %48.sub2:vreg_128 = COPY %47.sub2
+    ; CHECK-NEXT: %48.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %50.sub0:vreg_128 = COPY %48.sub0 {
+    ; CHECK-NEXT:   internal %50.sub2:vreg_128 = COPY %48.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %50, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %55.sub2:vreg_128 = COPY %54.sub2
+    ; CHECK-NEXT: %55.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %57.sub0:vreg_128 = COPY %55.sub0 {
+    ; CHECK-NEXT:   internal %57.sub2:vreg_128 = COPY %55.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %57, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %62.sub2:vreg_128 = COPY %61.sub2
+    ; CHECK-NEXT: %62.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %64.sub0:vreg_128 = COPY %62.sub0 {
+    ; CHECK-NEXT:   internal %64.sub2:vreg_128 = COPY %62.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %64, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %69.sub2:vreg_128 = COPY %68.sub2
+    ; CHECK-NEXT: %69.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %71.sub0:vreg_128 = COPY %69.sub0 {
+    ; CHECK-NEXT:   internal %71.sub2:vreg_128 = COPY %69.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %71, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %76.sub2:vreg_128 = COPY %75.sub2
+    ; CHECK-NEXT: %76.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %78.sub0:vreg_128 = COPY %76.sub0 {
+    ; CHECK-NEXT:   internal %78.sub2:vreg_128 = COPY %76.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %78, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %83.sub2:vreg_128 = COPY %82.sub2
+    ; CHECK-NEXT: %83.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %85.sub0:vreg_128 = COPY %83.sub0 {
+    ; CHECK-NEXT:   internal %85.sub2:vreg_128 = COPY %83.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %85, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %90.sub2:vreg_128 = COPY %89.sub2
+    ; CHECK-NEXT: %90.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %140.sub0:vreg_128 = COPY %90.sub0 {
+    ; CHECK-NEXT:   internal %140.sub2:vreg_128 = COPY %90.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %140, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %95.sub2:vreg_128 = COPY %94.sub2
+    ; CHECK-NEXT: %95.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %107.sub0:vreg_128 = COPY %95.sub0 {
+    ; CHECK-NEXT:   internal %107.sub2:vreg_128 = COPY %95.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %107, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %100.sub2:vreg_128 = COPY %99.sub2
+    ; CHECK-NEXT: %100.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %101.sub0:vreg_128 = COPY %100.sub0 {
+    ; CHECK-NEXT:   internal %101.sub2:vreg_128 = COPY %100.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %105.sub2:vreg_128 = COPY %104.sub2
+    ; CHECK-NEXT: %105.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %106.sub0:vreg_128 = COPY %105.sub0 {
+    ; CHECK-NEXT:   internal %106.sub2:vreg_128 = COPY %105.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %158.sub0:vreg_128 = COPY %139.sub0 {
+    ; CHECK-NEXT:   internal %158.sub2:vreg_128 = COPY %139.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %158, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %186.sub2:vreg_128 = COPY %185.sub2
+    ; CHECK-NEXT: %186.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %188.sub0:vreg_128 = COPY %186.sub0 {
+    ; CHECK-NEXT:   internal %188.sub2:vreg_128 = COPY %186.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %188, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %167.sub2:vreg_128 = COPY %166.sub2
+    ; CHECK-NEXT: %167.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %169.sub0:vreg_128 = COPY %167.sub0 {
+    ; CHECK-NEXT:   internal %169.sub2:vreg_128 = COPY %167.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %169, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %114.sub2:vreg_128 = COPY %113.sub2
+    ; CHECK-NEXT: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %115.sub0:vreg_128 = COPY %114.sub0 {
+    ; CHECK-NEXT:   internal %115.sub2:vreg_128 = COPY %114.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %119.sub2:vreg_128 = COPY %118.sub2
+    ; CHECK-NEXT: %119.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %181.sub0:vreg_128 = COPY %119.sub0 {
+    ; CHECK-NEXT:   internal %181.sub2:vreg_128 = COPY %119.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %181, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
     ; CHECK-NEXT: undef %124.sub2:vreg_128 = COPY %123.sub2
-    ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %139.sub2:vreg_128 = COPY %138.sub2
-    ; CHECK-NEXT: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %143.sub2:vreg_128 = COPY %142.sub2
-    ; CHECK-NEXT: %143.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %147.sub2:vreg_128 = COPY %146.sub2
-    ; CHECK-NEXT: %147.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: %156.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %125.sub0:vreg_128 = COPY %124.sub0 {
+    ; CHECK-NEXT:   internal %125.sub2:vreg_128 = COPY %124.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %129.sub2:vreg_128 = COPY %128.sub2
+    ; CHECK-NEXT: %129.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %130.sub0:vreg_128 = COPY %129.sub0 {
+    ; CHECK-NEXT:   internal %130.sub2:vreg_128 = COPY %129.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %134.sub2:vreg_128 = COPY %133.sub2
+    ; CHECK-NEXT: %134.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %135.sub0:vreg_128 = COPY %134.sub0 {
+    ; CHECK-NEXT:   internal %135.sub2:vreg_128 = COPY %134.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %145.sub2:vreg_128 = COPY %144.sub2
+    ; CHECK-NEXT: %145.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %146.sub0:vreg_128 = COPY %145.sub0 {
+    ; CHECK-NEXT:   internal %146.sub2:vreg_128 = COPY %145.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %150.sub2:vreg_128 = COPY %149.sub2
+    ; CHECK-NEXT: %150.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %151.sub0:vreg_128 = COPY %150.sub0 {
+    ; CHECK-NEXT:   internal %151.sub2:vreg_128 = COPY %150.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %157.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %155.sub2:vreg_128 = COPY %157.sub2
+    ; CHECK-NEXT: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %156.sub0:vreg_128 = COPY %155.sub0 {
+    ; CHECK-NEXT:   internal %156.sub2:vreg_128 = COPY %155.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %165.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %163.sub2:vreg_128 = COPY %165.sub2
+    ; CHECK-NEXT: %163.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %164.sub0:vreg_128 = COPY %163.sub0 {
+    ; CHECK-NEXT:   internal %164.sub2:vreg_128 = COPY %163.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %176.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %174.sub2:vreg_128 = COPY %176.sub2
+    ; CHECK-NEXT: %174.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %175.sub0:vreg_128 = COPY %174.sub0 {
+    ; CHECK-NEXT:   internal %175.sub2:vreg_128 = COPY %174.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %195.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %180.sub2:vreg_128 = COPY %195.sub2
+    ; CHECK-NEXT: %180.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %194.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %193.sub2:vreg_128 = COPY %194.sub2
+    ; CHECK-NEXT: %193.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK-NEXT: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK-NEXT: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK-NEXT: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -175,164 +225,164 @@ body:             |
     ; CHECK-NEXT: %36.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %36.sub3:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: undef %157.sub0:vreg_128 = COPY %156.sub0 {
-    ; CHECK-NEXT:   internal %157.sub2:vreg_128 = COPY %156.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %157.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %157.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %149.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
-    ; CHECK-NEXT:   internal %149.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %149.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %149.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %149, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %145.sub0:vreg_128 = COPY %147.sub0 {
-    ; CHECK-NEXT:   internal %145.sub2:vreg_128 = COPY %147.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %145.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %145.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %145, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %141.sub0:vreg_128 = COPY %143.sub0 {
-    ; CHECK-NEXT:   internal %141.sub2:vreg_128 = COPY %143.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %141.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %141.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %141, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY %139.sub0 {
-    ; CHECK-NEXT:   internal %137.sub2:vreg_128 = COPY %139.sub2
+    ; CHECK-NEXT: undef %191.sub0:vreg_128 = COPY %193.sub0 {
+    ; CHECK-NEXT:   internal %191.sub2:vreg_128 = COPY %193.sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: %137.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %137.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %126.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
-    ; CHECK-NEXT:   internal %126.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
+    ; CHECK-NEXT: %191.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %191.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %191, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %178.sub0:vreg_128 = COPY %180.sub0 {
+    ; CHECK-NEXT:   internal %178.sub2:vreg_128 = COPY %180.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %178.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %178.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %178, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef %172.sub0:vreg_128 = COPY %175.sub0 {
+    ; CHECK-NEXT:   internal %172.sub2:vreg_128 = COPY %175.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %172.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %172.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %172, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %161.sub0:vreg_128 = COPY %164.sub0 {
+    ; CHECK-NEXT:   internal %161.sub2:vreg_128 = COPY %164.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %161.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %161.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %161, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: undef %153.sub0:vreg_128 = COPY %156.sub0 {
+    ; CHECK-NEXT:   internal %153.sub2:vreg_128 = COPY %156.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %153.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %153.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %148.sub0:vreg_128 = COPY %151.sub0 {
+    ; CHECK-NEXT:   internal %148.sub2:vreg_128 = COPY %151.sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: %126.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %126.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %126, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %124.sub0 {
-    ; CHECK-NEXT:   internal %122.sub2:vreg_128 = COPY %124.sub2
+    ; CHECK-NEXT: %148.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %148.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef %143.sub0:vreg_128 = COPY %146.sub0 {
+    ; CHECK-NEXT:   internal %143.sub2:vreg_128 = COPY %146.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %143.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %143.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %132.sub0:vreg_128 = COPY %135.sub0 {
+    ; CHECK-NEXT:   internal %132.sub2:vreg_128 = COPY %135.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %132.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %132.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %132, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
+    ; CHECK-NEXT: undef %127.sub0:vreg_128 = COPY %130.sub0 {
+    ; CHECK-NEXT:   internal %127.sub2:vreg_128 = COPY %130.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %127.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %127.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %127, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %125.sub0 {
+    ; CHECK-NEXT:   internal %122.sub2:vreg_128 = COPY %125.sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %122.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %122.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %118.sub0:vreg_128 = COPY %120.sub0 {
-    ; CHECK-NEXT:   internal %118.sub2:vreg_128 = COPY %120.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %118.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %118.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %118, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
-    ; CHECK-NEXT: undef %114.sub0:vreg_128 = COPY %116.sub0 {
-    ; CHECK-NEXT:   internal %114.sub2:vreg_128 = COPY %116.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %114.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %114.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %114, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY %112.sub0 {
-    ; CHECK-NEXT:   internal %110.sub2:vreg_128 = COPY %112.sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %117.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
+    ; CHECK-NEXT:   internal %117.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %117.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %117.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %117, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %112.sub0:vreg_128 = COPY %115.sub0 {
+    ; CHECK-NEXT:   internal %112.sub2:vreg_128 = COPY %115.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %112.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %112.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
+    ; CHECK-NEXT:   internal %110.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %110.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %110.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %105.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
-    ; CHECK-NEXT:   internal %105.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %105.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %105.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %105, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %101.sub0:vreg_128 = COPY %103.sub0 {
-    ; CHECK-NEXT:   internal %101.sub2:vreg_128 = COPY %103.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %101.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %101.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %101, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
-    ; CHECK-NEXT:   internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %99.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %99.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: %19.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %19.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
-    ; CHECK-NEXT:   internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %94.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %94.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %90.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
-    ; CHECK-NEXT:   internal %90.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %90.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %90.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %90, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %86.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
-    ; CHECK-NEXT:   internal %86.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %86.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %86.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
-    ; CHECK-NEXT:   internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %184.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
+    ; CHECK-NEXT:   internal %184.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %184.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %184.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %184, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
+    ; CHECK-NEXT:   internal %137.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %137.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %137.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %103.sub0:vreg_128 = COPY %106.sub0 {
+    ; CHECK-NEXT:   internal %103.sub2:vreg_128 = COPY %106.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %103.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %103.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %103, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK-NEXT: undef %98.sub0:vreg_128 = COPY %101.sub0 {
+    ; CHECK-NEXT:   internal %98.sub2:vreg_128 = COPY %101.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %98.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %98.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %98, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %93.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
+    ; CHECK-NEXT:   internal %93.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %93.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %93.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %93, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %88.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
+    ; CHECK-NEXT:   internal %88.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %88.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %88.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %88, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
+    ; CHECK-NEXT:   internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %81.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %81.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %76.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
-    ; CHECK-NEXT:   internal %76.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %76.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %76.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %76, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %71.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
-    ; CHECK-NEXT:   internal %71.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %71.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %71.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %71, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %66.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
-    ; CHECK-NEXT:   internal %66.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %66.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %66.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 {
-    ; CHECK-NEXT:   internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %61.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %61.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 {
-    ; CHECK-NEXT:   internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %56.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %56.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 {
-    ; CHECK-NEXT:   internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %51.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %51.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 {
-    ; CHECK-NEXT:   internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %74.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
+    ; CHECK-NEXT:   internal %74.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %74.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %74.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %74, %2, 0, 80, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %67.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
+    ; CHECK-NEXT:   internal %67.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %67.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %67.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %67, %2, 0, 32, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %60.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
+    ; CHECK-NEXT:   internal %60.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %60.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %60.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %60, %2, 0, 48, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %53.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
+    ; CHECK-NEXT:   internal %53.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %53.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %53.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %53, %2, 0, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 {
+    ; CHECK-NEXT:   internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %46.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %46.sub3:vreg_128 = COPY %43.sub1

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
new file mode 100644
index 0000000000000..b1180805c55da
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -0,0 +1,1881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=0 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS0 %s
+
+declare void @wobble()
+
+define internal fastcc void @widget() {
+; GFX90A-LABEL: widget:
+; GFX90A:       ; %bb.0: ; %bb
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX90A-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX90A-NEXT:    s_mov_b32 s33, s32
+; GFX90A-NEXT:    s_addk_i32 s32, 0x400
+; GFX90A-NEXT:    s_getpc_b64 s[16:17]
+; GFX90A-NEXT:    s_add_u32 s16, s16, wobble at gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s17, s17, wobble at gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX90A-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+bb:
+  tail call void @wobble()
+  unreachable
+}
+
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) {
+; GLOBALNESS1-LABEL: kernel:
+; GLOBALNESS1:       ; %bb.0: ; %bb
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v44, 0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[56:57], s[8:9], 0x8
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[6:7], s[38:39], 0x18
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    global_load_dword v0, v44, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b32 s61, 0
+; GLOBALNESS1-NEXT:    s_mov_b32 s60, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s62, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s63, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s65, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s67, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, s60
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, s61
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, s62
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a35, s63
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a36, s64
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a37, s65
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a38, s66
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a39, s67
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a40, s68
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a41, s69
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a42, s70
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a43, s71
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a44, s72
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a45, s73
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a46, s74
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a47, s75
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a48, s76
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a49, s77
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a50, s78
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a51, s79
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a52, s80
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a53, s81
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a54, s82
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a55, s83
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a56, s84
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a57, s85
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a58, s86
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a59, s87
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, s88
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, s89
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, s90
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, s91
+; GLOBALNESS1-NEXT:    s_movk_i32 s60, 0x80
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s60, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s61, 1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s62, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s63, 3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s64, 4
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s65, 5
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s66, 6
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s67, 7
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s68, 8
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s69, 9
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s70, 10
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s71, 11
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s72, 12
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s73, 13
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s74, 14
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s75, 15
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s76, 16
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s77, 17
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s78, 18
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s79, 19
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s80, 20
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s81, 21
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s82, 22
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s83, 23
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s84, 24
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s85, 25
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s86, 26
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s87, 27
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s88, 28
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s89, 29
+; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s90, 30
+; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s91, 31
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], v[44:45]
+; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 32
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 33
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s56, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 34
+; GLOBALNESS1-NEXT:    s_load_dword s9, s[38:39], 0x20
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 35
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_xor_b64 s[46:47], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s8, 0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_xor_b64 s[50:51], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s9, 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
+; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, wobble at gotpcrel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, wobble at gotpcrel32@hi+12
+; GLOBALNESS1-NEXT:    s_xor_b64 s[52:53], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 36
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[42:43], s[6:7], 0x0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 37
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 38
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 39
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 40
+; GLOBALNESS1-NEXT:    s_mov_b32 s100, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s101, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s44, s14
+; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[48:49], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 41
+; GLOBALNESS1-NEXT:    s_mov_b32 s45, 0x3ff00000
+; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS1-NEXT:    s_branch .LBB1_4
+; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 40
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 41
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow6
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow19
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, v31
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 42
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, v30
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, v29
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, v28
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a59, v27
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a58, v26
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a57, v25
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a56, v24
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a55, v23
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a54, v22
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a53, v21
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a52, v20
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a51, v19
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a50, v18
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a49, v17
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a48, v16
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a47, v15
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a46, v14
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a45, v13
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a44, v12
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a43, v11
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a42, v10
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a41, v9
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a40, v8
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a39, v7
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a38, v6
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a37, v5
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a36, v4
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a35, v3
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, v2
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, v1
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, v0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 43
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
+; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_17 Depth 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    buffer_store_dword v44, off, s[0:3], 0
+; GLOBALNESS1-NEXT:    flat_load_dword v43, v[0:1]
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS1-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_10
+; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], -1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s57, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
+; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock3
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s57, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow17
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:  ; %bb.8: ; %LeafBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s57, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow18
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s8, 42
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s9, 43
+; GLOBALNESS1-NEXT:  .LBB1_10: ; %Flow16
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[60:61], s[68:69]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s92, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s93, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s94, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s95, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s96, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s97, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s98, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s99, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s92, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s93, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s94, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s95, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s96, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s97, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s98, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s99, s61
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %baz.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS1-NEXT:    s_mov_b32 s65, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s67, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s45
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[54:55], 0, v0
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[90:91], s[54:55]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb33.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 36
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 37
+; GLOBALNESS1-NEXT:    s_mov_b64 s[92:93], s[58:59]
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s57
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.13: ; %bb39.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 32
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 34
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 33
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 35
+; GLOBALNESS1-NEXT:    s_branch .LBB1_17
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %Flow7
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS1-NEXT:  .LBB1_17: ; %bb44.i
+; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[46:47]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[64:65]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
+; GLOBALNESS1-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[48:49]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_add_u32 s60, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s61, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_15
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow14
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s56, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s57, v41, 1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s57
+; GLOBALNESS1-NEXT:    v_readlane_b32 s58, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s59, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s57
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_mov_b32 s57, s89
+; GLOBALNESS1-NEXT:    s_mov_b64 s[58:59], s[92:93]
+; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow15
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[90:91]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[54:55]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
+; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 38
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 39
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
+; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_1
+; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_2
+; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS1-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:  .LBB1_32: ; %Flow
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS1-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget at rel32@hi+12
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS1-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+;
+; GLOBALNESS0-LABEL: kernel:
+; GLOBALNESS0:       ; %bb.0: ; %bb
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s16, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s15, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s10, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s11, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s6, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s7, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 6
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 7
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[56:57], s[8:9], 0x8
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[6:7], s[38:39], 0x18
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s65, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, s60
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, s61
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, s62
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, s63
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a36, s64
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a37, s65
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a38, s66
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a39, s67
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a40, s68
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a41, s69
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a42, s70
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a43, s71
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a44, s72
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a45, s73
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a46, s74
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a47, s75
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a48, s76
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a49, s77
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a50, s78
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a51, s79
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a52, s80
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a53, s81
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a54, s82
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a55, s83
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a56, s84
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a57, s85
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a58, s86
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a59, s87
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, s88
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, s89
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, s90
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, s91
+; GLOBALNESS0-NEXT:    s_movk_i32 s60, 0x80
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 24
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 25
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 26
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 27
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s80, 28
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s81, 29
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s82, 30
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s83, 31
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s84, 32
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s85, 33
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s86, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s87, 35
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s88, 36
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s89, 37
+; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s90, 38
+; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s91, 39
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], v[44:45]
+; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 41
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 42
+; GLOBALNESS0-NEXT:    s_load_dword s9, s[38:39], 0x20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 43
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_xor_b64 s[36:37], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s8, 0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_xor_b64 s[34:35], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s9, 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
+; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, wobble at gotpcrel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, wobble at gotpcrel32@hi+12
+; GLOBALNESS0-NEXT:    s_xor_b64 s[100:101], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 44
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 45
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 46
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 47
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 48
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 49
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 50
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 51
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, 0x3ff00000
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 53
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s72, 16
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[42:43], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s73, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s74, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s75, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 20
+; GLOBALNESS0-NEXT:    s_mov_b32 s33, s14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 21
+; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS0-NEXT:    s_branch .LBB1_4
+; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 50
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 51
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow6
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow19
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, v31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v42, 22
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, v30
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, v29
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, v28
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a59, v27
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a58, v26
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a57, v25
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a56, v24
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a55, v23
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a54, v22
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a53, v21
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a52, v20
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a51, v19
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a50, v18
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a49, v17
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a48, v16
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a47, v15
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a46, v14
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a45, v13
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a44, v12
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a43, v11
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a42, v10
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a41, v9
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a40, v8
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a39, v7
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a38, v6
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a37, v5
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a36, v4
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, v3
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, v2
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, v1
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, v0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v42, 23
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
+; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_17 Depth 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    buffer_store_dword v44, off, s[0:3], 0
+; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS0-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_10
+; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], -1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s57, 1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
+; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock3
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s57, 1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow17
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:  ; %bb.8: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s57, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow18
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s8, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s9, 23
+; GLOBALNESS0-NEXT:  .LBB1_10: ; %Flow16
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s92, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s93, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s94, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s95, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s96, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s97, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s98, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s99, s61
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %baz.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 52
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 53
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 54
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 55
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 56
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 57
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 58
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 59
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 60
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 61
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 62
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 63
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v42, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v42, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v42, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v42, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v42, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v42, 5
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, s65
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v42, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v42, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v42, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v42, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v42, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v42, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v42, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v42, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v42, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v42, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v42, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v42, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v42, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v42, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s65
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 53
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s72, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s73, 17
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[96:97], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s74, 18
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s75, 19
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[40:41], s[96:97]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb33.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 44
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 45
+; GLOBALNESS0-NEXT:    s_mov_b64 s[98:99], s[58:59]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.13: ; %bb39.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 42
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 41
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 43
+; GLOBALNESS0-NEXT:    s_branch .LBB1_17
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %Flow7
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[100:101]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS0-NEXT:  .LBB1_17: ; %bb44.i
+; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[60:61]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
+; GLOBALNESS0-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 46
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 47
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_add_u32 s64, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s46, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s48, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s44, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s65, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s47, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s49, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s45, v41, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s50, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s51, v41, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s51
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s51
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_15
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow14
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s58, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s59, v41, 11
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[56:57]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 17
+; GLOBALNESS0-NEXT:    s_mov_b32 s56, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s57, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s58, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s59, s49
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s63, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s65, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[64:65]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 31
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[66:67]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 39
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 24
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 25
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 26
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 27
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 28
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 29
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 30
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 31
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 32
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 33
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 35
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 36
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 37
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 38
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s64
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[48:49]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b32 s58, s62
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[60:61]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s59, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[58:59]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[56:57]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[74:75], s[62:63]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    s_mov_b64 s[72:73], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[70:71], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[68:69], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[66:67], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[64:65], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[48:49]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[66:67]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[68:69]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[70:71]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[72:73]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[74:75]
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s76
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s65
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[44:45], s[44:45] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[46:47], s[46:47] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[48:49], s[48:49] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[50:51], s[50:51] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[52:53], s[52:53] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[54:55], s[54:55] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[56:57], s[56:57] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[58:59], s[58:59] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[62:63], s[62:63] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v42, 20
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v42, 21
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[98:99]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow15
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[40:41]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
+; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 48
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 49
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
+; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_1
+; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_2
+; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS0-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS0-NEXT:    s_mov_b32 s34, s33
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS0-NEXT:    s_mov_b32 s33, s34
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:  .LBB1_32: ; %Flow
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS0-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS0-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+bb:
+  store i32 0, i32 addrspace(1)* null, align 4
+  %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4
+  br label %bb5
+
+bb5:                                              ; preds = %bb5.backedge, %bb
+  %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
+  %tmp14.1.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
+  store i32 0, i32 addrspace(5)* null, align 4
+  %tmp14.2.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
+  %tmp15.2.i = icmp eq i32 %tmp14.2.i, 0
+  %spec.select.2.i = select i1 %tmp15.2.i, i32 0, i32 %tmp14.1.i
+  tail call void @wobble()
+  br i1 %tmp3.i.i, label %bb4.i.i, label %baz.exit.i
+
+bb4.i.i:                                          ; preds = %bb5
+  switch i32 %tmp5.i.i, label %baz.exit.i [
+    i32 0, label %bb7.i.i
+    i32 1, label %bb11.i.i
+  ]
+
+bb7.i.i:                                          ; preds = %bb4.i.i
+  tail call fastcc void @widget()
+  unreachable
+
+bb11.i.i:                                         ; preds = %bb4.i.i
+  tail call fastcc void @widget()
+  unreachable
+
+baz.exit.i:                                       ; preds = %bb4.i.i, %bb5
+  %tmp26.i = load i32, i32* null, align 4
+  %tmp27.i4 = load double, double addrspace(1)* null, align 8
+  %tmp31.i = icmp slt i32 %tmp26.i, 0
+  br i1 %tmp31.i, label %bb33.i, label %bb64.i
+
+bb33.i:                                           ; preds = %baz.exit.i
+  %tmp38.i = icmp slt i32 %tmp4, 0
+  br i1 %tmp38.i, label %bb39.i, label %bb44.lr.ph.i
+
+bb39.i:                                           ; preds = %bb33.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb44.lr.ph.i
+
+bb44.lr.ph.i:                                     ; preds = %bb39.i, %bb33.i
+  br label %bb44.i
+
+bb44.i:                                           ; preds = %bb63.i, %bb44.lr.ph.i
+  br i1 %tmp3.i.i, label %bb63.i, label %bb46.i
+
+bb46.i:                                           ; preds = %bb44.i
+  br i1 %tmp438.i, label %bb63.i, label %bb50.i
+
+bb50.i:                                           ; preds = %bb46.i
+  switch i32 0, label %spam.exit.i [
+    i32 0, label %bb1.i.i
+  ]
+
+bb1.i.i:                                          ; preds = %bb50.i
+  %tmp2.i.i = fcmp ogt double %tmp27.i, 1.617000e+03
+  br i1 %tmp2.i.i, label %spam.exit.i, label %bb3.i.i
+
+bb3.i.i:                                          ; preds = %bb1.i.i
+  %tmp4.i.i = fcmp ogt double %tmp27.i, 0.000000e+00
+  br i1 %tmp4.i.i, label %spam.exit.i, label %bb6.i.i
+
+bb6.i.i:                                          ; preds = %bb3.i.i
+  %tmp7.i.i = fcmp ogt double %tmp27.i4, 0.000000e+00
+  br i1 %tmp7.i.i, label %spam.exit.i, label %bb8.i.i
+
+bb8.i.i:                                          ; preds = %bb6.i.i
+  tail call void null()
+  br label %spam.exit.i
+
+spam.exit.i:                                      ; preds = %bb8.i.i, %bb6.i.i, %bb3.i.i, %bb1.i.i, %bb50.i
+  %tmp22.i = icmp sgt i32 %tmp4, 0
+  br i1 %tmp22.i, label %bb63.i, label %bb55.i
+
+bb55.i:                                           ; preds = %spam.exit.i
+  tail call void @wobble()
+  %tmp0 = extractelement <9 x double> %tmp4.i.sroa.0.0, i32 0
+  store double %tmp0, double addrspace(1)* null, align 8
+  tail call void @wobble()
+  %tmp61.i = icmp eq i32 %spec.select.2.i, 0
+  br i1 %tmp61.i, label %bb62.i, label %bb63.i
+
+bb62.i:                                           ; preds = %bb55.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb63.i
+
+bb63.i:                                           ; preds = %bb62.i, %bb55.i, %spam.exit.i, %bb46.i, %bb44.i
+  br i1 %tmp48.i, label %bb44.i, label %bb64.i
+
+bb64.i:                                           ; preds = %bb63.i, %baz.exit.i
+  %tmp4.i.sroa.0.1 = phi <9 x double> [ <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %baz.exit.i ], [ zeroinitializer, %bb63.i ]
+  br i1 %tmp31.i, label %bb67.i, label %bb5.backedge
+
+bb5.backedge:                                     ; preds = %bb73.i, %bb70.i, %bb64.i
+  br label %bb5
+
+bb67.i:                                           ; preds = %bb64.i
+  %tmp68.i = icmp eq i32 %tmp4, 1
+  br i1 %tmp68.i, label %bb69.i, label %bb70.i
+
+bb69.i:                                           ; preds = %bb67.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb70.i
+
+bb70.i:                                           ; preds = %bb69.i, %bb67.i
+  %tmp3.i.i2 = icmp eq i32 %tmp4, 0
+  br i1 %tmp3.i.i2, label %bb73.i, label %bb5.backedge
+
+bb73.i:                                           ; preds = %bb70.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb5.backedge
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 8bc247d9ebaf3..2f59c59d1f199 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -181,13 +181,13 @@ define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s0, s17
 ; CHECK-NEXT:    vmov.f32 s2, s14
 ; CHECK-NEXT:    vmov.f32 s3, s18
+; CHECK-NEXT:    vmov.f32 s21, s7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d0, d4
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d0, d4
 ; CHECK-NEXT:    vmov.f32 s19, s10
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s21, s7
@@ -200,44 +200,45 @@ define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s5, s27
-; CHECK-NEXT:    vmov.f32 s8, s24
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s24, s1
-; CHECK-NEXT:    vmov.f32 s27, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov r0, r3, d14
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s8, s24
+; CHECK-NEXT:    vmov.f32 s9, s0
 ; CHECK-NEXT:    vmov.f32 s11, s25
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s20, s12
-; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vmov r0, lr, d14
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.f32 s18, s6
 ; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov r2, r4, d14
+; CHECK-NEXT:    vmov.f64 d12, d14
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q5, [r1, #144]
+; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s5, s27
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s24, s1
+; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s27, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov r2, r4, d14
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    vmov.32 q4[1], r4
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q5, [r1, #144]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q0[2], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #160
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index da969e01258d8..43ef891eab716 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -122,39 +122,29 @@ define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
 ; CHECK-NEXT:    sub sp, #192
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
 ; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
-; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #240]
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
@@ -922,39 +912,29 @@ define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) {
 ; CHECK-NEXT:    sub sp, #192
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
 ; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
-; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #240]
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]


        


More information about the llvm-commits mailing list