[llvm] 70cb57d - AMDGPU/GlobalISel: Improve private addressing mode matching

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 11 07:38:26 PST 2021


Author: Matt Arsenault
Date: 2021-03-11T10:23:35-05:00
New Revision: 70cb57d7da3108f4ea9cd5bc0d3b08accd109f0e

URL: https://github.com/llvm/llvm-project/commit/70cb57d7da3108f4ea9cd5bc0d3b08accd109f0e
DIFF: https://github.com/llvm/llvm-project/commit/70cb57d7da3108f4ea9cd5bc0d3b08accd109f0e.diff

LOG: AMDGPU/GlobalISel: Improve private addressing mode matching

This enables the look-through-copy handling, which works around constants
not being correctly regbankselected to match the use bank.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8e831005921c..8c587b5e67f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3712,23 +3712,19 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   Optional<int> FI;
   Register VAddr = Root.getReg();
   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
-    if (isBaseWithConstantOffset(Root, *MRI)) {
-      const MachineOperand &LHS = RootDef->getOperand(1);
-      const MachineOperand &RHS = RootDef->getOperand(2);
-      const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
-      const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
-      if (LHSDef && RHSDef) {
-        int64_t PossibleOffset =
-            RHSDef->getOperand(1).getCImm()->getSExtValue();
-        if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
-            (!STI.privateMemoryResourceIsRangeChecked() ||
-             KnownBits->signBitIsZero(LHS.getReg()))) {
-          if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
-            FI = LHSDef->getOperand(1).getIndex();
-          else
-            VAddr = LHS.getReg();
-          Offset = PossibleOffset;
-        }
+    Register PtrBase;
+    int64_t ConstOffset;
+    std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+    if (ConstOffset != 0) {
+      if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+          (!STI.privateMemoryResourceIsRangeChecked() ||
+           KnownBits->signBitIsZero(PtrBase))) {
+        const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
+        if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+          FI = PtrBaseDef->getOperand(1).getIndex();
+        else
+          VAddr = PtrBase;
+        Offset = ConstOffset;
       }
     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
       FI = RootDef->getOperand(1).getIndex();

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 3ab8af27c704..abb71f31348d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -11,58 +11,31 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_add_u32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v3, 0x100, v3
-; GCN-NEXT:    v_add_u32_e32 v60, 16, v3
-; GCN-NEXT:    v_add_co_u32_e32 v52, vcc, v0, v16
-; GCN-NEXT:    v_addc_co_u32_e32 v53, vcc, v1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-NEXT:    v_add_co_u32_e32 v56, vcc, v0, v16
-; GCN-NEXT:    v_addc_co_u32_e32 v57, vcc, v1, v17, vcc
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
 ; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
@@ -70,254 +43,220 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
 ; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
 ; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[52:53], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[52:53], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[52:53], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:16
-; GCN-NEXT:    v_add_u32_e32 v0, 20, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[56:57], off offset:48
-; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 24, v3
-; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 28, v3
-; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 32, v3
-; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 36, v3
-; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 40, v3
-; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 44, v3
-; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 48, v3
-; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 52, v3
-; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 56, v3
-; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 60, v3
-; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 64, v3
-; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v3
-; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v3
-; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v3
-; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v3
-; GCN-NEXT:    buffer_store_dword v20, v60, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v3
-; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v3
-; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v3
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v3
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v3
-; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v3
-; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v3
-; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v32
-; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v13, v33
-; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v3
-; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v14, v34
-; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v3
-; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v15, v35
-; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v3
-; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v3
-; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v3
-; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v3
-; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v3
-; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v3
-; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v3
-; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v3
-; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v3
-; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v3
-; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v3
-; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v3
-; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v3
-; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v3
-; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v3
-; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v3
-; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v3
-; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v3
-; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v3
-; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v3
-; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v3
-; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 4, v3
-; GCN-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 8, v3
-; GCN-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 12, v3
-; GCN-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd0, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v8
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v5, v9
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd4, v3
-; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v6, v10
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd8, v3
-; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v7, v11
-; GCN-NEXT:    v_add_u32_e32 v0, 0xdc, v3
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe0, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe4, v3
-; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v14
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe8, v3
-; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v15
-; GCN-NEXT:    v_add_u32_e32 v0, 0xec, v3
-; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf0, v3
-; GCN-NEXT:    buffer_store_dword v56, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf4, v3
-; GCN-NEXT:    buffer_store_dword v57, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf8, v3
-; GCN-NEXT:    buffer_store_dword v58, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xfc, v3
-; GCN-NEXT:    buffer_store_dword v59, v0, s[0:3], 0 offen
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:48
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 63, v2
+; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v0, v3, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
+; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v20
+; GCN-NEXT:    v_mov_b32_e32 v13, v21
+; GCN-NEXT:    v_mov_b32_e32 v14, v22
+; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v11
+; GCN-NEXT:    v_mov_b32_e32 v9, v12
+; GCN-NEXT:    v_mov_b32_e32 v10, v13
+; GCN-NEXT:    v_mov_b32_e32 v11, v14
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -333,58 +272,31 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_add_u32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v3, 0x100, v3
-; GCN-NEXT:    v_add_u32_e32 v60, 16, v3
-; GCN-NEXT:    v_add_co_u32_e32 v52, vcc, v0, v16
-; GCN-NEXT:    v_addc_co_u32_e32 v53, vcc, v1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, s4
-; GCN-NEXT:    v_add_co_u32_e32 v56, vcc, v0, v16
-; GCN-NEXT:    v_addc_co_u32_e32 v57, vcc, v1, v17, vcc
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
 ; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
@@ -392,259 +304,225 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
 ; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
 ; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[52:53], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[52:53], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[52:53], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:16
-; GCN-NEXT:    v_add_u32_e32 v0, 20, v3
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[56:57], off offset:48
-; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 24, v3
-; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 28, v3
-; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 32, v3
-; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 36, v3
-; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 40, v3
-; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 44, v3
-; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 48, v3
-; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 52, v3
-; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 56, v3
-; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 60, v3
-; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 64, v3
-; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v3
-; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v3
-; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v3
-; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v3
-; GCN-NEXT:    buffer_store_dword v20, v60, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v3
-; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v3
-; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v3
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v3
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v3
-; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v3
-; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v3
-; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v32
-; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v13, v33
-; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v3
-; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v14, v34
-; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v3
-; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v15, v35
-; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v3
-; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v3
-; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v3
-; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v3
-; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v3
-; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v3
-; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v3
-; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v3
-; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v3
-; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v3
-; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v3
-; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v3
-; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v3
-; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v3
-; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v3
-; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v3
-; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v3
-; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v3
-; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v3
-; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v3
-; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v3
-; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 4, v3
-; GCN-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 8, v3
-; GCN-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 12, v3
-; GCN-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:48
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 63, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v0, v3, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v8
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v5, v9
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd4, v3
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v6, v10
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd8, v3
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v7, v11
-; GCN-NEXT:    v_add_u32_e32 v1, 0xdc, v3
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v12
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe4, v3
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v14
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe8, v3
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v15
-; GCN-NEXT:    v_add_u32_e32 v1, 0xec, v3
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf0, v3
-; GCN-NEXT:    buffer_store_dword v56, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v3
-; GCN-NEXT:    buffer_store_dword v57, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf8, v3
-; GCN-NEXT:    buffer_store_dword v58, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xfc, v3
-; GCN-NEXT:    buffer_store_dword v59, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v20
+; GCN-NEXT:    v_mov_b32_e32 v13, v21
+; GCN-NEXT:    v_mov_b32_e32 v14, v22
+; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v11
+; GCN-NEXT:    v_mov_b32_e32 v9, v12
+; GCN-NEXT:    v_mov_b32_e32 v10, v13
+; GCN-NEXT:    v_mov_b32_e32 v11, v14
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
+; GCN-NEXT:    v_lshrrev_b32_e64 v15, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v15, 0x100, v15
+; GCN-NEXT:    v_add_u32_e32 v0, v15, v0
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s6
-; GCN-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NEXT:    s_waitcnt vmcnt(14)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -661,61 +539,31 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_add_u32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
 ; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off
 ; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
 ; GCN-NEXT:    s_mov_b32 s5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 ; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v6, s5
 ; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
-; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
-; GCN-NEXT:    v_lshrrev_b32_e64 v62, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v62, 0x100, v62
-; GCN-NEXT:    v_add_u32_e32 v2, 16, v62
-; GCN-NEXT:    s_add_u32 s32, s32, 0x14000
-; GCN-NEXT:    s_sub_u32 s32, s32, 0x14000
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:704 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
 ; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
@@ -725,292 +573,218 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:16
-; GCN-NEXT:    v_add_u32_e32 v0, 20, v62
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:708 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:712 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:716 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:720 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:724 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:728 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:732 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:736 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:740 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:744 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:748 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:752 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:756 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:760 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:764 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:768 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 24, v62
-; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 28, v62
-; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 32, v62
-; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 36, v62
-; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 40, v62
-; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 44, v62
-; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 48, v62
-; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 52, v62
-; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 56, v62
-; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 60, v62
-; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 64, v62
-; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v62
-; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v62
-; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v62
-; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v62
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v62
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v62
-; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v62
-; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v62
-; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v62
-; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v62
-; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v62
-; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v62
-; GCN-NEXT:    buffer_store_dword v56, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v62
-; GCN-NEXT:    buffer_store_dword v57, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v62
-; GCN-NEXT:    buffer_store_dword v58, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v62
-; GCN-NEXT:    buffer_store_dword v59, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v62
-; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v62
-; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v62
-; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v62
-; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v62
-; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v62
-; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v62
-; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v62
-; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v62
-; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v62
-; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v62
-; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v62
-; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v62
-; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v62
-; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v62
-; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v62
-; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v62
-; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v62
-; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v62
-; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v62
-; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:704 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 4, v62
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 8, v62
-; GCN-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 12, v62
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:708 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:712 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:716 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:720 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:724 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:728 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:732 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:736 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:740 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:744 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:748 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:752 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:756 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:760 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:764 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:768 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd0, v62
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v5
-; GCN-NEXT:    v_mov_b32_e32 v5, v6
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd4, v62
-; GCN-NEXT:    v_mov_b32_e32 v6, v7
-; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xd8, v62
-; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v7, v8
-; GCN-NEXT:    v_add_u32_e32 v0, 0xdc, v62
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe0, v62
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v9
-; GCN-NEXT:    v_mov_b32_e32 v9, v10
-; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe4, v62
-; GCN-NEXT:    v_mov_b32_e32 v10, v11
-; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xe8, v62
-; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v12
-; GCN-NEXT:    v_add_u32_e32 v0, 0xec, v62
-; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf0, v62
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v13
-; GCN-NEXT:    v_mov_b32_e32 v13, v14
-; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf4, v62
-; GCN-NEXT:    v_mov_b32_e32 v14, v15
-; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v0, 0xf8, v62
-; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v15, v16
-; GCN-NEXT:    v_add_u32_e32 v0, 0xfc, v62
-; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 31, v0
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
+; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_add_u32_e32 v0, v62, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
+; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[60:61], off offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v20
+; GCN-NEXT:    v_mov_b32_e32 v13, v21
+; GCN-NEXT:    v_mov_b32_e32 v14, v22
+; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v11
+; GCN-NEXT:    v_mov_b32_e32 v9, v12
+; GCN-NEXT:    v_mov_b32_e32 v10, v13
+; GCN-NEXT:    v_mov_b32_e32 v11, v14
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
+; GCN-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index b28af50de136..40b38c61aadf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -10,15 +10,15 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GCN-NEXT:    v_mov_b32_e32 v16, 0x100
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_add_u32_e32 v31, 64, v16
+; GCN-NEXT:    v_mov_b32_e32 v16, 0x100
+; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[52:67], s[10:11], 0x40
 ; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x80
-; GCN-NEXT:    v_add_u32_e32 v32, 0x44, v16
-; GCN-NEXT:    v_add_u32_e32 v33, 0x48, v16
+; GCN-NEXT:    s_and_b32 s4, s7, 63
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NEXT:    v_mov_b32_e32 v1, s37
@@ -38,328 +38,217 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v15, s51
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0xc0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:256
-; GCN-NEXT:    v_add_u32_e32 v0, 4, v16
-; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s52
-; GCN-NEXT:    buffer_store_dword v1, v31, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, s54
-; GCN-NEXT:    buffer_store_dword v1, v33, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v34, 0x4c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s55
-; GCN-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v35, 0x50, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-NEXT:    buffer_store_dword v1, v35, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v36, 0x54, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s57
-; GCN-NEXT:    buffer_store_dword v1, v36, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v37, 0x58, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-NEXT:    buffer_store_dword v1, v37, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v38, 0x5c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s59
-; GCN-NEXT:    buffer_store_dword v1, v38, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v39, 0x60, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s60
-; GCN-NEXT:    buffer_store_dword v1, v39, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v40, 0x64, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s61
-; GCN-NEXT:    buffer_store_dword v1, v40, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v41, 0x68, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s62
-; GCN-NEXT:    buffer_store_dword v1, v41, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v42, 0x6c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s63
-; GCN-NEXT:    buffer_store_dword v1, v42, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v43, 0x70, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s64
-; GCN-NEXT:    buffer_store_dword v1, v43, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v44, 0x74, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s65
-; GCN-NEXT:    buffer_store_dword v1, v44, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v45, 0x78, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s66
-; GCN-NEXT:    buffer_store_dword v1, v45, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v46, 0x7c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s67
-; GCN-NEXT:    buffer_store_dword v1, v46, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v47, 0x80, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NEXT:    buffer_store_dword v1, v47, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v48, 0x84, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NEXT:    buffer_store_dword v1, v48, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v49, 0x88, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NEXT:    buffer_store_dword v1, v49, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v50, 0x8c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-NEXT:    buffer_store_dword v1, v50, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v51, 0x90, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s16
-; GCN-NEXT:    buffer_store_dword v1, v51, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v52, 0x94, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NEXT:    buffer_store_dword v1, v52, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v53, 0x98, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s18
-; GCN-NEXT:    buffer_store_dword v1, v53, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v54, 0x9c, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NEXT:    buffer_store_dword v1, v54, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v55, 0xa0, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s20
-; GCN-NEXT:    buffer_store_dword v1, v55, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v56, 0xa4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NEXT:    buffer_store_dword v1, v56, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v57, 0xa8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s22
-; GCN-NEXT:    buffer_store_dword v1, v57, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v58, 0xac, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NEXT:    buffer_store_dword v1, v58, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v59, 0xb0, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s24
-; GCN-NEXT:    buffer_store_dword v1, v59, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v60, 0xb4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NEXT:    buffer_store_dword v1, v60, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v61, 0xb8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s26
-; GCN-NEXT:    buffer_store_dword v1, v61, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v62, 0xbc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NEXT:    buffer_store_dword v1, v62, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v63, 0xc0, v16
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:260
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:264
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:268
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:272
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:276
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], 0 offset:280
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:288
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], 0 offset:292
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], 0 offset:296
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], 0 offset:300
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], 0 offset:304
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], 0 offset:308
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], 0 offset:312
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], 0 offset:316
+; GCN-NEXT:    v_mov_b32_e32 v0, s52
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:320
+; GCN-NEXT:    v_mov_b32_e32 v0, s53
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:324
+; GCN-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:328
+; GCN-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:332
+; GCN-NEXT:    v_mov_b32_e32 v0, s56
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:336
+; GCN-NEXT:    v_mov_b32_e32 v0, s57
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:340
+; GCN-NEXT:    v_mov_b32_e32 v0, s58
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:344
+; GCN-NEXT:    v_mov_b32_e32 v0, s59
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:348
+; GCN-NEXT:    v_mov_b32_e32 v0, s60
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:352
+; GCN-NEXT:    v_mov_b32_e32 v0, s61
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:356
+; GCN-NEXT:    v_mov_b32_e32 v0, s62
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:360
+; GCN-NEXT:    v_mov_b32_e32 v0, s63
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:364
+; GCN-NEXT:    v_mov_b32_e32 v0, s64
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:368
+; GCN-NEXT:    v_mov_b32_e32 v0, s65
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:372
+; GCN-NEXT:    v_mov_b32_e32 v0, s66
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:376
+; GCN-NEXT:    v_mov_b32_e32 v0, s67
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:380
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:384
+; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:388
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:392
+; GCN-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:396
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:400
+; GCN-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:404
+; GCN-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:408
+; GCN-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:412
+; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:416
+; GCN-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:420
+; GCN-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:424
+; GCN-NEXT:    v_mov_b32_e32 v0, s23
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:428
+; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:432
+; GCN-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:436
+; GCN-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:440
+; GCN-NEXT:    v_mov_b32_e32 v0, s27
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:444
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NEXT:    buffer_store_dword v1, v63, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v64, 0xc4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s37
-; GCN-NEXT:    buffer_store_dword v1, v64, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v65, 0xc8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NEXT:    buffer_store_dword v1, v65, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v66, 0xcc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s39
-; GCN-NEXT:    buffer_store_dword v1, v66, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v67, 0xd0, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s40
-; GCN-NEXT:    buffer_store_dword v1, v67, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v68, 0xd4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s41
-; GCN-NEXT:    buffer_store_dword v1, v68, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v69, 0xd8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s42
-; GCN-NEXT:    buffer_store_dword v1, v69, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v70, 0xdc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s43
-; GCN-NEXT:    buffer_store_dword v1, v70, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v71, 0xe0, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s44
-; GCN-NEXT:    buffer_store_dword v1, v71, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v72, 0xe4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s45
-; GCN-NEXT:    buffer_store_dword v1, v72, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v73, 0xe8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s46
-; GCN-NEXT:    buffer_store_dword v1, v73, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v74, 0xec, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NEXT:    buffer_store_dword v1, v74, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v75, 0xf0, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s48
-; GCN-NEXT:    buffer_store_dword v1, v75, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v76, 0xf4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s49
-; GCN-NEXT:    s_and_b32 s4, s7, 63
-; GCN-NEXT:    buffer_store_dword v1, v76, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v77, 0xf8, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s50
-; GCN-NEXT:    v_add_u32_e32 v17, 8, v16
-; GCN-NEXT:    buffer_store_dword v1, v77, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v78, 0xfc, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s51
-; GCN-NEXT:    s_lshl_b32 s4, s4, 2
-; GCN-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v18, 12, v16
-; GCN-NEXT:    v_add_u32_e32 v19, 16, v16
-; GCN-NEXT:    v_add_u32_e32 v20, 20, v16
-; GCN-NEXT:    v_add_u32_e32 v21, 24, v16
-; GCN-NEXT:    v_add_u32_e32 v22, 28, v16
-; GCN-NEXT:    v_add_u32_e32 v23, 32, v16
-; GCN-NEXT:    v_add_u32_e32 v24, 36, v16
-; GCN-NEXT:    v_add_u32_e32 v25, 40, v16
-; GCN-NEXT:    v_add_u32_e32 v26, 44, v16
-; GCN-NEXT:    v_add_u32_e32 v27, 48, v16
-; GCN-NEXT:    v_add_u32_e32 v28, 52, v16
-; GCN-NEXT:    v_add_u32_e32 v29, 56, v16
-; GCN-NEXT:    v_add_u32_e32 v30, 60, v16
-; GCN-NEXT:    buffer_store_dword v1, v78, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, s4, v16
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v5, v20, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v6, v21, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v22, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v23, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v24, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v25, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v11, v26, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v12, v27, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v13, v28, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v14, v29, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v15, v30, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    buffer_load_dword v2, v17, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v3, v18, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v4, v19, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v5, v20, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v6, v21, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, v22, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v8, v23, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v9, v24, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v10, v25, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v11, v26, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v12, v27, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v13, v28, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v14, v29, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v15, v30, s[0:3], 0 offen
-; GCN-NEXT:    ; kill: killed $vgpr30
-; GCN-NEXT:    ; kill: killed $vgpr19
-; GCN-NEXT:    ; kill: killed $vgpr23
-; GCN-NEXT:    ; kill: killed $vgpr27
-; GCN-NEXT:    ; kill: killed $vgpr20
-; GCN-NEXT:    ; kill: killed $vgpr24
-; GCN-NEXT:    ; kill: killed $vgpr28
-; GCN-NEXT:    ; kill: killed $vgpr17
-; GCN-NEXT:    ; kill: killed $vgpr21
-; GCN-NEXT:    ; kill: killed $vgpr25
-; GCN-NEXT:    ; kill: killed $vgpr0
-; GCN-NEXT:    ; kill: killed $vgpr29
-; GCN-NEXT:    ; kill: killed $vgpr18
-; GCN-NEXT:    ; kill: killed $vgpr22
-; GCN-NEXT:    ; kill: killed $vgpr26
-; GCN-NEXT:    buffer_load_dword v16, v31, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v17, v32, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v18, v33, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v19, v34, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v20, v35, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v21, v36, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v22, v37, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v23, v38, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v24, v39, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v25, v40, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v26, v41, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v27, v42, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v28, v43, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v29, v44, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v30, v45, s[0:3], 0 offen
-; GCN-NEXT:    ; kill: killed $vgpr45
-; GCN-NEXT:    ; kill: killed $vgpr34
-; GCN-NEXT:    ; kill: killed $vgpr38
-; GCN-NEXT:    ; kill: killed $vgpr42
-; GCN-NEXT:    ; kill: killed $vgpr31
-; GCN-NEXT:    ; kill: killed $vgpr35
-; GCN-NEXT:    ; kill: killed $vgpr39
-; GCN-NEXT:    ; kill: killed $vgpr43
-; GCN-NEXT:    ; kill: killed $vgpr32
-; GCN-NEXT:    ; kill: killed $vgpr36
-; GCN-NEXT:    ; kill: killed $vgpr40
-; GCN-NEXT:    ; kill: killed $vgpr44
-; GCN-NEXT:    ; kill: killed $vgpr33
-; GCN-NEXT:    ; kill: killed $vgpr37
-; GCN-NEXT:    ; kill: killed $vgpr41
-; GCN-NEXT:    buffer_load_dword v31, v46, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v32, v47, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v33, v48, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v34, v49, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v35, v50, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v36, v51, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v37, v52, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v38, v53, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v39, v54, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v40, v55, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v41, v56, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v42, v57, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v43, v58, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v44, v59, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v45, v60, s[0:3], 0 offen
-; GCN-NEXT:    ; kill: killed $vgpr60
-; GCN-NEXT:    ; kill: killed $vgpr49
-; GCN-NEXT:    ; kill: killed $vgpr53
-; GCN-NEXT:    ; kill: killed $vgpr57
-; GCN-NEXT:    ; kill: killed $vgpr46
-; GCN-NEXT:    ; kill: killed $vgpr50
-; GCN-NEXT:    ; kill: killed $vgpr54
-; GCN-NEXT:    ; kill: killed $vgpr58
-; GCN-NEXT:    ; kill: killed $vgpr47
-; GCN-NEXT:    ; kill: killed $vgpr51
-; GCN-NEXT:    ; kill: killed $vgpr55
-; GCN-NEXT:    ; kill: killed $vgpr59
-; GCN-NEXT:    ; kill: killed $vgpr48
-; GCN-NEXT:    ; kill: killed $vgpr52
-; GCN-NEXT:    ; kill: killed $vgpr56
-; GCN-NEXT:    buffer_load_dword v46, v61, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v47, v62, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v48, v63, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v49, v64, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v50, v65, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v51, v66, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v52, v67, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v53, v68, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v54, v69, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v55, v70, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v56, v71, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v57, v72, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v58, v73, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v59, v74, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v60, v75, s[0:3], 0 offen
-; GCN-NEXT:    ; kill: killed $vgpr64
-; GCN-NEXT:    ; kill: killed $vgpr68
-; GCN-NEXT:    ; kill: killed $vgpr72
-; GCN-NEXT:    ; kill: killed $vgpr61
-; GCN-NEXT:    ; kill: killed $vgpr65
-; GCN-NEXT:    ; kill: killed $vgpr69
-; GCN-NEXT:    ; kill: killed $vgpr73
-; GCN-NEXT:    ; kill: killed $vgpr62
-; GCN-NEXT:    ; kill: killed $vgpr66
-; GCN-NEXT:    ; kill: killed $vgpr70
-; GCN-NEXT:    ; kill: killed $vgpr74
-; GCN-NEXT:    ; kill: killed $vgpr63
-; GCN-NEXT:    ; kill: killed $vgpr67
-; GCN-NEXT:    ; kill: killed $vgpr71
-; GCN-NEXT:    ; kill: killed $vgpr75
-; GCN-NEXT:    buffer_load_dword v61, v76, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, v77, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, v78, s[0:3], 0 offen
-; GCN-NEXT:    ; kill: killed $vgpr76
-; GCN-NEXT:    ; kill: killed $vgpr77
-; GCN-NEXT:    ; kill: killed $vgpr78
+; GCN-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:448
+; GCN-NEXT:    v_mov_b32_e32 v0, s37
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:452
+; GCN-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:456
+; GCN-NEXT:    v_mov_b32_e32 v0, s39
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:460
+; GCN-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:464
+; GCN-NEXT:    v_mov_b32_e32 v0, s41
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:468
+; GCN-NEXT:    v_mov_b32_e32 v0, s42
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:472
+; GCN-NEXT:    v_mov_b32_e32 v0, s43
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:476
+; GCN-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:480
+; GCN-NEXT:    v_mov_b32_e32 v0, s45
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:484
+; GCN-NEXT:    v_mov_b32_e32 v0, s46
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:488
+; GCN-NEXT:    v_mov_b32_e32 v0, s47
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:492
+; GCN-NEXT:    v_mov_b32_e32 v0, s48
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:496
+; GCN-NEXT:    v_mov_b32_e32 v0, s49
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:500
+; GCN-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:504
+; GCN-NEXT:    v_mov_b32_e32 v0, s51
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:508
+; GCN-NEXT:    v_add_u32_e32 v0, s4, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:256
-; GCN-NEXT:    v_mov_b32_e32 v64, 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:260
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:264
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:268
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:272
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:276
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:280
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:284
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:288
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:292
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:296
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:300
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:304
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:308
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:312
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:316
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], 0 offset:320
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], 0 offset:324
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], 0 offset:328
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], 0 offset:332
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], 0 offset:336
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], 0 offset:340
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], 0 offset:344
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], 0 offset:348
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], 0 offset:352
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], 0 offset:356
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], 0 offset:360
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], 0 offset:364
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], 0 offset:368
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], 0 offset:372
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], 0 offset:376
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], 0 offset:380
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], 0 offset:384
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], 0 offset:388
+; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], 0 offset:392
+; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], 0 offset:396
+; GCN-NEXT:    buffer_load_dword v36, off, s[0:3], 0 offset:400
+; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], 0 offset:404
+; GCN-NEXT:    buffer_load_dword v38, off, s[0:3], 0 offset:408
+; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], 0 offset:412
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], 0 offset:416
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], 0 offset:420
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], 0 offset:424
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], 0 offset:428
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], 0 offset:432
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], 0 offset:436
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], 0 offset:440
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], 0 offset:444
+; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], 0 offset:448
+; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], 0 offset:452
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], 0 offset:456
+; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], 0 offset:460
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], 0 offset:464
+; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], 0 offset:468
+; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], 0 offset:472
+; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], 0 offset:476
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], 0 offset:480
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], 0 offset:484
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], 0 offset:488
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], 0 offset:492
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], 0 offset:496
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], 0 offset:500
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], 0 offset:504
+; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], 0 offset:508
+; GCN-NEXT:    s_waitcnt vmcnt(60)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[8:9]
+; GCN-NEXT:    s_waitcnt vmcnt(57)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[8:9] offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(54)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[8:9] offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(51)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[8:9] offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(48)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[8:9] offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(45)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[8:9] offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(42)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[8:9] offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(39)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[8:9] offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(36)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[8:9] offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(33)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[8:9] offset:144
+; GCN-NEXT:    s_waitcnt vmcnt(30)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[8:9] offset:160
+; GCN-NEXT:    s_waitcnt vmcnt(27)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[8:9] offset:176
+; GCN-NEXT:    s_waitcnt vmcnt(24)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[8:9] offset:192
+; GCN-NEXT:    s_waitcnt vmcnt(21)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[8:9] offset:208
+; GCN-NEXT:    s_waitcnt vmcnt(18)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[8:9] offset:224
+; GCN-NEXT:    s_waitcnt vmcnt(15)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[8:9] offset:240
 ; GCN-NEXT:    s_endpgm
   %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 77bbf8375f2f..f0411d4233f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -782,7 +782,6 @@ body: |
 ...
 
 ---
-
 name: load_private_s32_from_1_fi_offset_4095
 legalized:       true
 regBankSelected: true
@@ -810,6 +809,36 @@ body: |
 
 ...
 
+# Have to hack around the copy of the constant to VGPR
+---
+name: load_private_s32_from_1_fi_offset_sgpr_4095
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+
+    ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    %0:vgpr(p5) = G_FRAME_INDEX %stack.0
+    %1:sgpr(s32) = G_CONSTANT i32 4095
+    %2:vgpr(s32) = COPY %1
+    %3:vgpr(p5) = G_PTR_ADD %0, %2
+    %4:vgpr(s32) = G_LOAD %3 :: (load 1, align 1, addrspace 5)
+    $vgpr0 = COPY %4
+
+...
+
 ---
 
 name: load_private_s32_from_1_fi_offset_4096

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 3eeea3cc79e4..f573bae21e87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -32,16 +32,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0x10
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x1000
-; GCN-NEXT:    s_add_u32 s5, s4, 4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s5, s8, 2
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v2, 1
 ; GCN-NEXT:    s_add_u32 s4, s4, s5
-; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -103,16 +101,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dword s8, s[4:5], 0xc
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x1000
 ; GCN-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; GCN-NEXT:    s_add_u32 s5, s4, 4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s5, s8, 2
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v2, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
 ; GCN-NEXT:    s_add_u32 s4, s4, s5
-; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -156,7 +152,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; GCN-LABEL: func_non_entry_block_static_alloca_align4:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s7, s33
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
@@ -170,11 +166,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-NEXT:    s_add_u32 s7, s6, 4
 ; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v4
 ; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
@@ -188,7 +182,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 
 entry:
@@ -222,7 +216,7 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; GCN-LABEL: func_non_entry_block_static_alloca_align64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s7, s33
 ; GCN-NEXT:    s_add_u32 s33, s32, 0xfc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
@@ -232,13 +226,11 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
 ; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
-; GCN-NEXT:    s_add_u32 s7, s6, 4
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v2, 1
-; GCN-NEXT:    v_mov_b32_e32 v4, s7
-; GCN-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v3
 ; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
@@ -252,7 +244,7 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x2000
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s33, s7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %arg.cond, 0


        


More information about the llvm-commits mailing list