[llvm] fbae346 - [GlobalISel] Add combine for PTR_ADD with regbanks

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 17 04:58:38 PDT 2021


Author: Sebastian Neubauer
Date: 2021-08-17T13:58:16+02:00
New Revision: fbae34635d83c106f99ccd11a53305915929bb9a

URL: https://github.com/llvm/llvm-project/commit/fbae34635d83c106f99ccd11a53305915929bb9a
DIFF: https://github.com/llvm/llvm-project/commit/fbae34635d83c106f99ccd11a53305915929bb9a.diff

LOG: [GlobalISel] Add combine for PTR_ADD with regbanks

Combine two G_PTR_ADDs, but keep the register bank of the constant.
That way, the combine can be used in post-regbank-select combines.

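Introduce two helper methods in CombinerHelper, getRegBank and
setRegBank, which get and set the (optional) register bank of a register.
Because a null bank is tolerated (getRegBank may return nullptr and
setRegBank does nothing when given nullptr), they can be used both before
and after register bank selection.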

Differential Revision: https://reviews.llvm.org/D103326

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCombine.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 8855631859fcf..d892a7525a6d3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -36,7 +36,10 @@ class GISelKnownBits;
 class MachineDominatorTree;
 class LegalizerInfo;
 struct LegalityQuery;
+class RegisterBank;
+class RegisterBankInfo;
 class TargetLowering;
+class TargetRegisterInfo;
 
 struct PreferredTuple {
   LLT Ty;                // The result type of the extend.
@@ -54,6 +57,7 @@ struct IndexedLoadStoreMatchInfo {
 struct PtrAddChain {
   int64_t Imm;
   Register Base;
+  const RegisterBank *Bank;
 };
 
 struct RegisterImmPair {
@@ -95,6 +99,8 @@ class CombinerHelper {
   GISelKnownBits *KB;
   MachineDominatorTree *MDT;
   const LegalizerInfo *LI;
+  const RegisterBankInfo *RBI;
+  const TargetRegisterInfo *TRI;
 
 public:
   CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
@@ -120,6 +126,18 @@ class CombinerHelper {
   void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp,
                         Register ToReg) const;
 
+  /// Get the register bank of \p Reg.
+  /// If Reg has not been assigned a register, a register class,
+  /// or a register bank, then this returns nullptr.
+  ///
+  /// \pre Reg.isValid()
+  const RegisterBank *getRegBank(Register Reg) const;
+
+  /// Set the register bank of \p Reg.
+  /// Does nothing if the RegBank is null.
+  /// This is the counterpart to getRegBank.
+  void setRegBank(Register Reg, const RegisterBank *RegBank);
+
   /// If \p MI is COPY, try to combine it.
   /// Returns true if MI changed.
   bool tryCombineCopy(MachineInstr &MI);

diff  --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 732b7ed5dd9d6..949ecacbffd90 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/LowLevelType.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -46,8 +47,9 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                                MachineIRBuilder &B, GISelKnownBits *KB,
                                MachineDominatorTree *MDT,
                                const LegalizerInfo *LI)
-    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
-      KB(KB), MDT(MDT), LI(LI) {
+    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB),
+      MDT(MDT), LI(LI), RBI(Builder.getMF().getSubtarget().getRegBankInfo()),
+      TRI(Builder.getMF().getSubtarget().getRegisterInfo()) {
   (void)this->KB;
 }
 
@@ -143,6 +145,15 @@ void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
   Observer.changedInstr(*FromRegOp.getParent());
 }
 
+const RegisterBank *CombinerHelper::getRegBank(Register Reg) const {
+  return RBI->getRegBank(Reg, MRI, *TRI);
+}
+
+void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) {
+  if (RegBank)
+    MRI.setRegBank(Reg, *RegBank);
+}
+
 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
   if (matchCombineCopy(MI)) {
     applyCombineCopy(MI);
@@ -1407,7 +1418,6 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
 
     // Don't promote to an alignment that would require dynamic stack
     // realignment.
-    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
     if (!TRI->hasStackRealignment(MF))
       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
         NewAlign = NewAlign / 2;
@@ -1512,7 +1522,6 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
 
     // Don't promote to an alignment that would require dynamic stack
     // realignment.
-    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
     if (!TRI->hasStackRealignment(MF))
       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
         NewAlign = NewAlign / 2;
@@ -1710,7 +1719,7 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
   if (!MaybeImmVal)
     return false;
 
-  MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
+  MachineInstr *Add2Def = MRI.getVRegDef(Add2);
   if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
     return false;
 
@@ -1751,6 +1760,7 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
   // Pass the combined immediate to the apply function.
   MatchInfo.Imm = AMNew.BaseOffs;
   MatchInfo.Base = Base;
+  MatchInfo.Bank = getRegBank(Imm2);
   return true;
 }
 
@@ -1760,6 +1770,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
   MachineIRBuilder MIB(MI);
   LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
   auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
+  setRegBank(NewOffset.getReg(0), MatchInfo.Bank);
   Observer.changingInstr(MI);
   MI.getOperand(1).setReg(MatchInfo.Base);
   MI.getOperand(2).setReg(NewOffset.getReg(0));

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c6273adca50f7..28946435af467 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -83,7 +83,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
 }
 
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
-  "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> {
+  "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
   let AdditionalArguments = [];

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 1b146ddc72b91..d4c1670b1c56d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,174 +8,191 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v64i32_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
-; GCN-NEXT:    s_mov_b32 s5, 0
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_mov_b32_e32 v4, s5
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:48
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v6, s5
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
-; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 63, v2
-; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[60:61], off offset:32
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[60:61], off offset:48
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    v_and_b32_e32 v0, 63, v2
+; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
+; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v20
-; GCN-NEXT:    v_mov_b32_e32 v13, v21
-; GCN-NEXT:    v_mov_b32_e32 v14, v22
-; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
 ; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
 ; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
 ; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
 ; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@@ -193,36 +210,39 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v11
-; GCN-NEXT:    v_mov_b32_e32 v9, v12
-; GCN-NEXT:    v_mov_b32_e32 v10, v13
-; GCN-NEXT:    v_mov_b32_e32 v11, v14
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:508
+; GCN-NEXT:    v_mov_b32_e32 v4, v7
+; GCN-NEXT:    v_mov_b32_e32 v5, v8
+; GCN-NEXT:    v_mov_b32_e32 v6, v9
+; GCN-NEXT:    v_mov_b32_e32 v7, v10
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
@@ -234,173 +254,190 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v128i16_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
-; GCN-NEXT:    s_mov_b32 s5, 0
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_mov_b32_e32 v4, s5
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:48
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v6, s5
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
-; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
-; GCN-NEXT:    v_bfe_u32 v0, v2, 1, 6
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[60:61], off offset:32
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[60:61], off offset:48
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    v_bfe_u32 v0, v2, 1, 6
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v20
-; GCN-NEXT:    v_mov_b32_e32 v13, v21
-; GCN-NEXT:    v_mov_b32_e32 v14, v22
-; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
 ; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
 ; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
 ; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
 ; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@@ -418,40 +455,43 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v11
-; GCN-NEXT:    v_mov_b32_e32 v9, v12
-; GCN-NEXT:    v_mov_b32_e32 v10, v13
-; GCN-NEXT:    v_mov_b32_e32 v11, v14
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:508
-; GCN-NEXT:    v_lshrrev_b32_e64 v11, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v11, 0x100, v11
-; GCN-NEXT:    v_add_u32_e32 v0, v11, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, v7
+; GCN-NEXT:    v_mov_b32_e32 v5, v8
+; GCN-NEXT:    v_mov_b32_e32 v6, v9
+; GCN-NEXT:    v_mov_b32_e32 v7, v10
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
+; GCN-NEXT:    v_lshrrev_b32_e64 v7, 6, s33
+; GCN-NEXT:    v_add_u32_e32 v7, 0x100, v7
+; GCN-NEXT:    v_add_u32_e32 v0, v7, v0
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s6
-; GCN-NEXT:    s_waitcnt vmcnt(16)
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(15)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -464,174 +504,191 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v32i64_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, s33
+; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[3:4], off offset:48
-; GCN-NEXT:    s_mov_b32 s5, 0
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_mov_b32_e32 v4, s5
-; GCN-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:192
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[3:4], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v[3:4], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[3:4], off offset:48
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v6, s5
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    v_add_co_u32_e32 v60, vcc, v0, v5
-; GCN-NEXT:    v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
-; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[60:61], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[60:61], off offset:32
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v[60:61], off offset:48
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v20
-; GCN-NEXT:    v_mov_b32_e32 v13, v21
-; GCN-NEXT:    v_mov_b32_e32 v14, v22
-; GCN-NEXT:    v_mov_b32_e32 v15, v23
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
+; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
+; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v12, v15
+; GCN-NEXT:    v_mov_b32_e32 v13, v16
+; GCN-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-NEXT:    v_mov_b32_e32 v15, v18
 ; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
 ; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
 ; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
 ; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@@ -649,37 +706,40 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v11
-; GCN-NEXT:    v_mov_b32_e32 v9, v12
-; GCN-NEXT:    v_mov_b32_e32 v10, v13
-; GCN-NEXT:    v_mov_b32_e32 v11, v14
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:508
+; GCN-NEXT:    v_mov_b32_e32 v4, v7
+; GCN-NEXT:    v_mov_b32_e32 v5, v8
+; GCN-NEXT:    v_mov_b32_e32 v6, v9
+; GCN-NEXT:    v_mov_b32_e32 v7, v10
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 2dedb531bc1bb..548debc54788b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -4138,12 +4138,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_37:
 ; GPRIDX:       ; %bb.0:
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], 0x80
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s5
-; GPRIDX-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GPRIDX-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v5
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
@@ -4151,12 +4146,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
 ; MOVREL-LABEL: v_extract_v64i32_37:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], 0x80
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v3, s5
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x90, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
@@ -4167,12 +4157,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0x80
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 04120780f8e19..e2e5e3369bd9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -6,123 +6,96 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GCN-LABEL: v_insert_v64i32_37:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT:    v_lshlrev_b32_e32 v68, 8, v0
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v64, 8, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v68
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 64, v2
-; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:48
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_add_co_u32_e32 v64, vcc, v2, v0
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_addc_co_u32_e32 v65, vcc, v3, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_add_co_u32_e32 v66, vcc, v2, v0
-; GCN-NEXT:    v_addc_co_u32_e32 v67, vcc, v3, v1, vcc
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v68, s[0:1]
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v68, s[0:1] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[52:55], v68, s[0:1] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v68, s[0:1] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[60:63], v68, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[64:65], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[64:65], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[64:65], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[66:67], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[66:67], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[66:67], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v68, s[0:1] offset:128
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v68, s[0:1] offset:192
-; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v64, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v64, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v64, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v64, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v64, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v64, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v64, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[60:63], v64, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
+; GCN-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0x3e7
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    global_store_dwordx4 v68, v[0:3], s[2:3] offset:128
-; GCN-NEXT:    global_store_dwordx4 v68, v[4:7], s[2:3] offset:144
-; GCN-NEXT:    global_store_dwordx4 v68, v[8:11], s[2:3] offset:160
-; GCN-NEXT:    global_store_dwordx4 v68, v[12:15], s[2:3] offset:176
-; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    global_store_dwordx4 v68, v[16:19], s[2:3] offset:192
-; GCN-NEXT:    global_store_dwordx4 v68, v[20:23], s[2:3] offset:208
-; GCN-NEXT:    global_store_dwordx4 v68, v[24:27], s[2:3] offset:224
-; GCN-NEXT:    global_store_dwordx4 v68, v[44:47], s[2:3]
-; GCN-NEXT:    global_store_dwordx4 v68, v[48:51], s[2:3] offset:16
-; GCN-NEXT:    global_store_dwordx4 v68, v[52:55], s[2:3] offset:32
-; GCN-NEXT:    global_store_dwordx4 v68, v[56:59], s[2:3] offset:48
-; GCN-NEXT:    global_store_dwordx4 v68, v[60:63], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v68, v[28:31], s[2:3] offset:240
-; GCN-NEXT:    global_store_dwordx4 v68, v[32:35], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v68, v[36:39], s[2:3] offset:96
-; GCN-NEXT:    global_store_dwordx4 v68, v[40:43], s[2:3] offset:112
+; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
+; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[2:3] offset:144
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[2:3] offset:160
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
+; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[2:3]
+; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_insert_v64i32_37:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 8, v0
-; GFX10-NEXT:    s_movk_i32 s4, 0x80
-; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    s_movk_i32 s4, 0xc0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 8, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_clause 0x4
-; GFX10-NEXT:    global_load_dwordx4 v[32:35], v70, s[0:1]
-; GFX10-NEXT:    global_load_dwordx4 v[36:39], v70, s[0:1] offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[40:43], v70, s[0:1] offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[44:47], v70, s[0:1] offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[48:51], v70, s[0:1] offset:64
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v5, v70
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v64, vcc_lo, v0, 64
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v66, vcc_lo, v0, v1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v68, vcc_lo, v0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo
-; GFX10-NEXT:    s_clause 0xa
-; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[64:65], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[56:59], v[64:65], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[60:63], v[64:65], off offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[66:67], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[66:67], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[66:67], off offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[68:69], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[68:69], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[68:69], off offset:48
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v70, s[0:1] offset:128
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v70, s[0:1] offset:192
-; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    s_clause 0xf
+; GFX10-NEXT:    global_load_dwordx4 v[32:35], v64, s[0:1]
+; GFX10-NEXT:    global_load_dwordx4 v[36:39], v64, s[0:1] offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[40:43], v64, s[0:1] offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[44:47], v64, s[0:1] offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[48:51], v64, s[0:1] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[52:55], v64, s[0:1] offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[56:59], v64, s[0:1] offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[60:63], v64, s[0:1] offset:112
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0x3e7
+; GFX10-NEXT:    global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
+; GFX10-NEXT:    global_store_dwordx4 v64, v[4:7], s[2:3] offset:144
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    global_store_dwordx4 v64, v[8:11], s[2:3] offset:160
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    global_store_dwordx4 v70, v[0:3], s[2:3] offset:128
-; GFX10-NEXT:    global_store_dwordx4 v70, v[4:7], s[2:3] offset:144
-; GFX10-NEXT:    global_store_dwordx4 v70, v[8:11], s[2:3] offset:160
-; GFX10-NEXT:    global_store_dwordx4 v70, v[12:15], s[2:3] offset:176
+; GFX10-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
+; GFX10-NEXT:    global_store_dwordx4 v64, v[32:35], s[2:3]
+; GFX10-NEXT:    global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
+; GFX10-NEXT:    global_store_dwordx4 v64, v[44:47], s[2:3] offset:48
+; GFX10-NEXT:    global_store_dwordx4 v64, v[48:51], s[2:3] offset:64
+; GFX10-NEXT:    global_store_dwordx4 v64, v[52:55], s[2:3] offset:80
+; GFX10-NEXT:    global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
+; GFX10-NEXT:    global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v70, v[16:19], s[2:3] offset:192
-; GFX10-NEXT:    global_store_dwordx4 v70, v[20:23], s[2:3] offset:208
-; GFX10-NEXT:    global_store_dwordx4 v70, v[24:27], s[2:3] offset:224
-; GFX10-NEXT:    global_store_dwordx4 v70, v[32:35], s[2:3]
-; GFX10-NEXT:    global_store_dwordx4 v70, v[36:39], s[2:3] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v70, v[40:43], s[2:3] offset:32
-; GFX10-NEXT:    global_store_dwordx4 v70, v[44:47], s[2:3] offset:48
-; GFX10-NEXT:    global_store_dwordx4 v70, v[48:51], s[2:3] offset:64
-; GFX10-NEXT:    global_store_dwordx4 v70, v[52:55], s[2:3] offset:80
-; GFX10-NEXT:    global_store_dwordx4 v70, v[56:59], s[2:3] offset:96
-; GFX10-NEXT:    global_store_dwordx4 v70, v[60:63], s[2:3] offset:112
-; GFX10-NEXT:    global_store_dwordx4 v70, v[28:31], s[2:3] offset:240
+; GFX10-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
 ; GFX10-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id


        


More information about the llvm-commits mailing list