[llvm-branch-commits] [llvm] 221fded - [AMDGPU][GlobalISel] Fold flat vgpr + constant addresses

Sebastian Neubauer via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Dec 23 02:08:43 PST 2020


Author: Sebastian Neubauer
Date: 2020-12-23T10:40:30+01:00
New Revision: 221fdedc692672d4f63ee768ae5c541626734240

URL: https://github.com/llvm/llvm-project/commit/221fdedc692672d4f63ee768ae5c541626734240
DIFF: https://github.com/llvm/llvm-project/commit/221fdedc692672d4f63ee768ae5c541626734240.diff

LOG: [AMDGPU][GlobalISel] Fold flat vgpr + constant addresses

Use getPtrBaseWithConstantOffset in selectFlatOffsetImpl to fold more
vgpr+constant addresses.

Differential Revision: https://reviews.llvm.org/D93692

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ac6ddbae350b..bfac1b412051 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3427,22 +3427,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
   if (!STI.hasFlatInstOffsets())
     return Default;
 
-  const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
-  if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
-    return Default;
-
-  Optional<int64_t> Offset =
-      getConstantVRegSExtVal(OpDef->getOperand(2).getReg(), *MRI);
-  if (!Offset.hasValue())
+  Register PtrBase;
+  int64_t ConstOffset;
+  std::tie(PtrBase, ConstOffset) =
+      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+  if (ConstOffset == 0)
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
-  if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
+  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
     return Default;
 
-  Register BasePtr = OpDef->getOperand(1).getReg();
-
-  return std::make_pair(BasePtr, Offset.getValue());
+  return std::make_pair(PtrBase, ConstOffset);
 }
 
 InstructionSelector::ComplexRendererFns

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 6fd99b8406d7..f0eefbd44bfb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,279 +8,316 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v64i32_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v15, v0
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v0
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GCN-NEXT:    s_mov_b32 s5, 0
 ; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, v1
-; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
-; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    v_mov_b32_e32 v17, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, s4
 ; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
-; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
-; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
-; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
-; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
-; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
-; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
-; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
-; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
-; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
-; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
-; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
-; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
-; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
-; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
-; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
-; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
-; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
-; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v17
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
-; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
-; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
-; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd4, v0
-; GCN-NEXT:    buffer_store_dword v52, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd8, v0
-; GCN-NEXT:    buffer_store_dword v53, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xdc, v0
-; GCN-NEXT:    buffer_store_dword v54, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v0
-; GCN-NEXT:    buffer_store_dword v55, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe4, v0
-; GCN-NEXT:    buffer_store_dword v56, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe8, v0
-; GCN-NEXT:    buffer_store_dword v57, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xec, v0
-; GCN-NEXT:    buffer_store_dword v58, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf0, v0
-; GCN-NEXT:    buffer_store_dword v59, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v0
-; GCN-NEXT:    buffer_store_dword v60, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf8, v0
-; GCN-NEXT:    buffer_store_dword v61, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xfc, v0
-; GCN-NEXT:    buffer_store_dword v62, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_and_b32_e32 v1, 63, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v3, 0x100, v3
+; GCN-NEXT:    v_add_u32_e32 v60, 16, v3
+; GCN-NEXT:    v_add_co_u32_e32 v52, vcc, v0, v16
+; GCN-NEXT:    v_addc_co_u32_e32 v53, vcc, v1, v17, vcc
+; GCN-NEXT:    v_mov_b32_e32 v17, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, s4
+; GCN-NEXT:    v_add_co_u32_e32 v56, vcc, v0, v16
+; GCN-NEXT:    v_addc_co_u32_e32 v57, vcc, v1, v17, vcc
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[52:53], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[52:53], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[52:53], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:16
+; GCN-NEXT:    v_add_u32_e32 v0, 20, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[56:57], off offset:48
+; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 24, v3
+; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 28, v3
+; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 32, v3
+; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 36, v3
+; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 40, v3
+; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 44, v3
+; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 48, v3
+; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 52, v3
+; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 56, v3
+; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 60, v3
+; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 64, v3
+; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v3
+; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v3
+; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v3
+; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v3
+; GCN-NEXT:    buffer_store_dword v20, v60, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v3
+; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v3
+; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v3
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v3
+; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v3
+; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v3
+; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v3
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v32
+; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v13, v33
+; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v3
+; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v14, v34
+; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v3
+; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v15, v35
+; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v3
+; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v3
+; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v3
+; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v3
+; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v3
+; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v3
+; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v3
+; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v3
+; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v3
+; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v3
+; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v3
+; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v3
+; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v3
+; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v3
+; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v3
+; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v3
+; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v3
+; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v3
+; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v3
+; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v3
+; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v3
+; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 4, v3
+; GCN-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 8, v3
+; GCN-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 12, v3
+; GCN-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd4, v3
+; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v6, v10
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd8, v3
+; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-NEXT:    v_add_u32_e32 v0, 0xdc, v3
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v12
+; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v9, v13
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe4, v3
+; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v10, v14
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe8, v3
+; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v15
+; GCN-NEXT:    v_add_u32_e32 v0, 0xec, v3
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf0, v3
+; GCN-NEXT:    buffer_store_dword v56, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf4, v3
+; GCN-NEXT:    buffer_store_dword v57, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf8, v3
+; GCN-NEXT:    buffer_store_dword v58, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xfc, v3
+; GCN-NEXT:    buffer_store_dword v59, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v0, 63, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_add_u32_e32 v0, v3, v0
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -293,284 +330,321 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v128i16_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v15, v0
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v0
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GCN-NEXT:    s_mov_b32 s5, 0
 ; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, v1
-; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
-; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    v_mov_b32_e32 v17, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, s4
 ; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
-; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
-; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
-; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
-; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
-; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
-; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
-; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
-; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
-; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
-; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
-; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
-; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
-; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
-; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
-; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
-; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
-; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
-; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v17
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
-; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
-; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    v_lshrrev_b32_e64 v3, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v3, 0x100, v3
+; GCN-NEXT:    v_add_u32_e32 v60, 16, v3
+; GCN-NEXT:    v_add_co_u32_e32 v52, vcc, v0, v16
+; GCN-NEXT:    v_addc_co_u32_e32 v53, vcc, v1, v17, vcc
+; GCN-NEXT:    v_mov_b32_e32 v17, s5
+; GCN-NEXT:    v_mov_b32_e32 v16, s4
+; GCN-NEXT:    v_add_co_u32_e32 v56, vcc, v0, v16
+; GCN-NEXT:    v_addc_co_u32_e32 v57, vcc, v1, v17, vcc
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[52:53], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[52:53], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[52:53], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:16
+; GCN-NEXT:    v_add_u32_e32 v0, 20, v3
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[56:57], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[56:57], off offset:48
+; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 24, v3
+; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 28, v3
+; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 32, v3
+; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 36, v3
+; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 40, v3
+; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 44, v3
+; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 48, v3
+; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 52, v3
+; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 56, v3
+; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 60, v3
+; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 64, v3
+; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v3
+; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v3
+; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v3
+; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v3
+; GCN-NEXT:    buffer_store_dword v20, v60, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v3
+; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v3
+; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v3
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v3
+; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v3
+; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v3
+; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v3
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v32
+; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v13, v33
+; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v3
+; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v14, v34
+; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v3
+; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v15, v35
+; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v3
+; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v3
+; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v3
+; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v3
+; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v3
+; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v3
+; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v3
+; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v3
+; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v3
+; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v3
+; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v3
+; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v3
+; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v3
+; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v3
+; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v3
+; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v3
+; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v3
+; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v3
+; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v3
+; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v3
+; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v3
+; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 4, v3
+; GCN-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 8, v3
+; GCN-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 12, v3
+; GCN-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 63, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_add_u32_e32 v0, v3, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd4, v3
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v6, v10
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd8, v3
+; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-NEXT:    v_add_u32_e32 v1, 0xdc, v3
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v12
 ; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    v_mov_b32_e32 v9, v13
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe4, v3
 ; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
+; GCN-NEXT:    v_mov_b32_e32 v10, v14
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe8, v3
 ; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd0, v0
-; GCN-NEXT:    buffer_store_dword v51, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
-; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd8, v0
-; GCN-NEXT:    buffer_store_dword v53, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xdc, v0
-; GCN-NEXT:    buffer_store_dword v54, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xe0, v0
-; GCN-NEXT:    buffer_store_dword v55, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xe4, v0
-; GCN-NEXT:    buffer_store_dword v56, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xe8, v0
-; GCN-NEXT:    buffer_store_dword v57, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xec, v0
-; GCN-NEXT:    buffer_store_dword v58, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xf0, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
-; GCN-NEXT:    buffer_store_dword v59, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xf4, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 63, v1
-; GCN-NEXT:    buffer_store_dword v60, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xf8, v0
-; GCN-NEXT:    buffer_store_dword v61, v3, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xfc, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v62, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v15
+; GCN-NEXT:    v_add_u32_e32 v1, 0xec, v3
+; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xf0, v3
+; GCN-NEXT:    buffer_store_dword v56, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v3
+; GCN-NEXT:    buffer_store_dword v57, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xf8, v3
+; GCN-NEXT:    buffer_store_dword v58, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xfc, v3
+; GCN-NEXT:    buffer_store_dword v59, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT:    s_mov_b32 s33, s6
-; GCN-NEXT:    s_waitcnt vmcnt(15)
+; GCN-NEXT:    s_waitcnt vmcnt(13)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -583,22 +657,10 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v32i64_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v15, v0
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, v1
-; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
-; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
@@ -614,232 +676,323 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
-; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[15:16], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[31:32], off
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[31:32], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[31:32], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[31:32], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
-; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
-; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
+; GCN-NEXT:    v_lshrrev_b32_e64 v62, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v62, 0x100, v62
+; GCN-NEXT:    v_add_u32_e32 v2, 16, v62
+; GCN-NEXT:    s_add_u32 s32, s32, 0x14000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x14000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
-; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
-; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
-; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
-; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
-; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
-; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
-; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
-; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
-; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
-; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
-; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
-; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
-; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
-; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
-; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
-; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v17
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
-; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
-; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
-; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd4, v0
-; GCN-NEXT:    buffer_store_dword v52, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xd8, v0
-; GCN-NEXT:    buffer_store_dword v53, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xdc, v0
-; GCN-NEXT:    buffer_store_dword v54, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v0
-; GCN-NEXT:    buffer_store_dword v55, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe4, v0
-; GCN-NEXT:    buffer_store_dword v56, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xe8, v0
-; GCN-NEXT:    buffer_store_dword v57, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xec, v0
-; GCN-NEXT:    buffer_store_dword v58, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf0, v0
-; GCN-NEXT:    buffer_store_dword v59, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v0
-; GCN-NEXT:    buffer_store_dword v60, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf8, v0
-; GCN-NEXT:    buffer_store_dword v61, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xfc, v0
-; GCN-NEXT:    buffer_store_dword v62, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_and_b32_e32 v1, 31, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:704 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:16
+; GCN-NEXT:    v_add_u32_e32 v0, 20, v62
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:708 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:712 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:716 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:720 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:724 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:728 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:732 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:736 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:740 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:744 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:748 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:752 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:756 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:760 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:764 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:768 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[60:61], off offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 24, v62
+; GCN-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 28, v62
+; GCN-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 32, v62
+; GCN-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 36, v62
+; GCN-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 40, v62
+; GCN-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 44, v62
+; GCN-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 48, v62
+; GCN-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 52, v62
+; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 56, v62
+; GCN-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 60, v62
+; GCN-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 64, v62
+; GCN-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x44, v62
+; GCN-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x48, v62
+; GCN-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x4c, v62
+; GCN-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x50, v62
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x54, v62
+; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x58, v62
+; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x5c, v62
+; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x60, v62
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x64, v62
+; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x68, v62
+; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x6c, v62
+; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x70, v62
+; GCN-NEXT:    buffer_store_dword v56, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x74, v62
+; GCN-NEXT:    buffer_store_dword v57, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x78, v62
+; GCN-NEXT:    buffer_store_dword v58, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x7c, v62
+; GCN-NEXT:    buffer_store_dword v59, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x80, v62
+; GCN-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x84, v62
+; GCN-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x88, v62
+; GCN-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x8c, v62
+; GCN-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x90, v62
+; GCN-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x94, v62
+; GCN-NEXT:    buffer_store_dword v45, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x98, v62
+; GCN-NEXT:    buffer_store_dword v46, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0x9c, v62
+; GCN-NEXT:    buffer_store_dword v47, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa0, v62
+; GCN-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa4, v62
+; GCN-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xa8, v62
+; GCN-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xac, v62
+; GCN-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb0, v62
+; GCN-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb4, v62
+; GCN-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xb8, v62
+; GCN-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xbc, v62
+; GCN-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc0, v62
+; GCN-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc4, v62
+; GCN-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xc8, v62
+; GCN-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xcc, v62
+; GCN-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 4, v62
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 8, v62
+; GCN-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 12, v62
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:724 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:728 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:732 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:740 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:748 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:752 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:760 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:764 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:768 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd0, v62
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, v5
+; GCN-NEXT:    v_mov_b32_e32 v5, v6
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd4, v62
+; GCN-NEXT:    v_mov_b32_e32 v6, v7
+; GCN-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xd8, v62
+; GCN-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v7, v8
+; GCN-NEXT:    v_add_u32_e32 v0, 0xdc, v62
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe0, v62
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, v9
+; GCN-NEXT:    v_mov_b32_e32 v9, v10
+; GCN-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe4, v62
+; GCN-NEXT:    v_mov_b32_e32 v10, v11
+; GCN-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xe8, v62
+; GCN-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v11, v12
+; GCN-NEXT:    v_add_u32_e32 v0, 0xec, v62
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf0, v62
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v13
+; GCN-NEXT:    v_mov_b32_e32 v13, v14
+; GCN-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf4, v62
+; GCN-NEXT:    v_mov_b32_e32 v14, v15
+; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v0, 0xf8, v62
+; GCN-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v15, v16
+; GCN-NEXT:    v_add_u32_e32 v0, 0xfc, v62
+; GCN-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 31, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_add_u32_e32 v0, v62, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 02d9d3cfbb85..5f04b3681dae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2520,13 +2520,7 @@ define i32 @v_extract_v64i32_32(<64 x i32> addrspace(1)* %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_32:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_movk_i32 s4, 0x80
-; GPRIDX-NEXT:    s_mov_b32 s5, 0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
-; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2551,13 +2545,7 @@ define i32 @v_extract_v64i32_33(<64 x i32> addrspace(1)* %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_33:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_movk_i32 s4, 0x80
-; GPRIDX-NEXT:    s_mov_b32 s5, 0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
-; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 2fe0c29e54de..ff28280ba9b4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -52,18 +52,17 @@ bb:
 define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-LABEL: store_load_vindex_kernel:
 ; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_kernel:
@@ -73,15 +72,14 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
-; GFX10-NEXT:    scratch_load_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124
 ; GFX10-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -147,9 +145,8 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -157,9 +154,8 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
@@ -234,12 +230,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_small_offset_kernel:
@@ -249,17 +244,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x104
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x104
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX10-NEXT:    s_add_u32 s0, 4, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    scratch_load_dword v3, off, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
-; GFX10-NEXT:    scratch_load_dword v0, v1, off
+; GFX10-NEXT:    scratch_load_dword v2, off, s0
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124
 ; GFX10-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -401,12 +395,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off
+; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_large_offset_kernel:
@@ -416,17 +409,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4004
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4004
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX10-NEXT:    s_add_u32 s0, 4, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    scratch_load_dword v3, off, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
-; GFX10-NEXT:    scratch_load_dword v0, v1, off
+; GFX10-NEXT:    scratch_load_dword v2, off, s0
+; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124
 ; GFX10-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -593,9 +585,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
 ; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
-; GFX9-NEXT:    scratch_load_dword v0, v0, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vidx_sidx_offset:
@@ -609,9 +600,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x400, v0
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
-; GFX10-NEXT:    scratch_load_dword v0, v0, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024
 ; GFX10-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index 5fc598b3dcbf..d4fac6bcc8eb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -999,21 +999,51 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 {
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 {
-; GCN-LABEL: flat_atomic_inc_ret_i32_offset:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s2, s2, 16
-; GCN-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    flat_store_dword v[0:1], v2
-; GCN-NEXT:    s_endpgm
+; CI-LABEL: flat_atomic_inc_ret_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
+; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i32, i32* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32* %out
@@ -1035,17 +1065,39 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind {
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind {
-; GCN-LABEL: flat_atomic_inc_noret_i32_offset:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s0, s0, 16
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; GCN-NEXT:    s_endpgm
+; CI-LABEL: flat_atomic_inc_noret_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 16
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i32, i32* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
@@ -1097,22 +1149,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 20, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 42
-; GFX9-NEXT:    flat_atomic_inc v2, v[2:3], v4 glc
+; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v4 offset:20 glc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dword v[0:1], v2
+; GFX9-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32* %ptr, i32 %id
@@ -1163,10 +1213,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 20, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:20 glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32* %ptr, i32 %id
@@ -1257,22 +1305,54 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 {
-; GCN-LABEL: flat_atomic_inc_ret_i64_offset:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s2, s2, 32
-; GCN-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GCN-NEXT:    s_endpgm
+; CI-LABEL: flat_atomic_inc_ret_i64_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 32
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i64_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i64, i64* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64* %out
@@ -1295,18 +1375,42 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind {
-; GCN-LABEL: flat_atomic_inc_noret_i64_offset:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 42
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s0, s0, 32
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GCN-NEXT:    s_endpgm
+; CI-LABEL: flat_atomic_inc_noret_i64_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 32
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT:    s_endpgm
   %gep = getelementptr i64, i64* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -1360,23 +1464,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 40, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64* %ptr, i32 %id
@@ -1428,12 +1530,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 40, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:40 glc
 ; GFX9-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64* %ptr, i32 %id

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
index 70651280003e..4f0c1586cad1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -16,13 +16,7 @@ define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %da
 ; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_movk_i32 s4, 0x800
-; GFX908-NEXT:    s_mov_b32 s5, 0
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    v_mov_b32_e32 v4, s5
-; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
@@ -34,13 +28,7 @@ define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float
 ; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
-; GFX908-NEXT:    s_mov_b32 s5, -1
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    v_mov_b32_e32 v4, s5
-; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:-2044
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
@@ -54,12 +42,10 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX908-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_add_u32 s0, s0, 0x800
-; GFX908-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, s0
-; GFX908-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s2
-; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
 ; GFX908-NEXT:    s_endpgm
   %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
@@ -81,13 +67,7 @@ define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr,
 ; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
-; GFX908-NEXT:    s_mov_b32 s5, -1
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    v_mov_b32_e32 v4, s5
-; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 291f40e4f22a..ff6467afde03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -18,54 +18,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_add_co_u32_e32 v2, vcc, 11, v0
-; GFX9-NOUNALIGNED-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v[2:3], off offset:-10
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[2:3], off offset:-9
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[2:3], off offset:-8
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[2:3], off offset:-7
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[2:3], off offset:-5
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[2:3], off offset:-4
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[2:3], off offset:-3
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[2:3], off offset:-1
-; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[0:1], off offset:1
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[0:1], off offset:3
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[0:1], off offset:4
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[0:1], off offset:5
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[0:1], off offset:6
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[0:1], off offset:7
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[0:1], off offset:8
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[0:1], off offset:9
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off offset:10
+; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v0, v[0:1], off offset:11
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NOUNALIGNED-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s5, 8
+; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v13, 8
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v2, s4, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v3
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v8, v8, v1
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v9, v9, v1
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, v12, v3
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v12, v12, v1
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, v2, v3
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v5, v6, v3, v7
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v5, v6, s4, v7
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v3, v10, v3, v11
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v0, v1, v4
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v8, v10, v1, v11
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v10, 24, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v4
 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v5, v6, v7
-; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v3, v8, v2
+; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v8, v9, v10
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -156,28 +154,25 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_add_co_u32_e32 v2, vcc, 10, v0
-; GFX9-NOUNALIGNED-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v[2:3], off offset:-8
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[2:3], off offset:-4
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[0:1], off offset:2
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[0:1], off offset:4
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[0:1], off offset:6
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[0:1], off offset:8
+; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v0, v[0:1], off offset:10
 ; GFX9-NOUNALIGNED-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v1, s4, v3
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v5, v5, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v3, s4, v5
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v0, s4, v1
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v4, v3, v5
-; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v6, v3, v2
+; GFX9-NOUNALIGNED-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v0, v2, s4, v1
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v1, v4, s4, v3
+; GFX9-NOUNALIGNED-NEXT:    v_and_or_b32 v2, v6, s4, v5
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:


        


More information about the llvm-branch-commits mailing list