[llvm] 3359ea6 - [Scheduling] Create the missing dependency edges for store cluster

QingShan Zhang via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 6 21:58:14 PDT 2020


Author: QingShan Zhang
Date: 2020-08-07T04:58:03Z
New Revision: 3359ea62edcc5f1d5831bebc2075746031cd22c5

URL: https://github.com/llvm/llvm-project/commit/3359ea62edcc5f1d5831bebc2075746031cd22c5
DIFF: https://github.com/llvm/llvm-project/commit/3359ea62edcc5f1d5831bebc2075746031cd22c5.diff

LOG: [Scheduling] Create the missing dependency edges for store cluster

For a load cluster, we don't need to copy SUb's predecessor edges (SUb->reg) to SUa,
because both loads depend on the same base register "reg".

     +-------+
+---->  reg  |
|    +---+---+
|        ^
|        |
|        |
|        |
|    +---+---+
|    |  SUa  |  Load 0(reg)
|    +---+---+
|        ^
|        |
|        |
|    +---+---+
+----+  SUb  |  Load 4(reg)
     +-------+

But for a store cluster, we need to create the missing edge shown below, so that the instruction
the store SUb depends on (y) cannot be scheduled in between SUa and SUb.

     +-------+
+---->  reg  |
|    +---+---+
|        ^
|        |         Missing       +-------+
|        | +-------------------->+   y   |
|        | |                     +---+---+
|    +---+-+-+                       ^
|    |  SUa  |  Store x 0(reg)       |
|    +---+---+                       |
|        ^                           |
|        |  +------------------------+
|        |  |
|    +---+--++
+----+  SUb  |  Store y 4(reg)
     +-------+

Reviewed By: evandro, arsenm, rampitec, foad, fhahn

Differential Revision: https://reviews.llvm.org/D72031

Added: 
    

Modified: 
    llvm/lib/CodeGen/MachineScheduler.cpp
    llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 7daaa3526aa3..fe4ceb292292 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1624,16 +1624,32 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
 
-    // Copy successor edges from SUa to SUb. Interleaving computation
-    // dependent on SUa can prevent load combining due to register reuse.
-    // Predecessor edges do not need to be copied from SUb to SUa since
-    // nearby loads should have effectively the same inputs.
-    for (const SDep &Succ : SUa->Succs) {
-      if (Succ.getSUnit() == SUb)
-        continue;
-      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
-                        << ")\n");
-      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    if (IsLoad) {
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since
+      // nearby loads should have effectively the same inputs.
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SUb)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+    } else {
+      // Copy predecessor edges from SUb to SUa to avoid the SUnits that
+      // SUb dependent on scheduled in-between SUb and SUa. Successor edges
+      // do not need to be copied from SUa to SUb since no one will depend
+      // on stores.
+      // Notice that, we don't need to care about the memory dependency as
+      // we won't try to cluster them if they have any memory dependency.
+      for (const SDep &Pred : SUb->Preds) {
+        if (Pred.getSUnit() == SUa)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+      }
     }
 
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
index bfe7e4941da8..e821e8504d96 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -194,3 +194,22 @@ entry:
   store i64 %add6.3, i64* %arrayidx5.3, align 8
   ret void
 }
+
+; Verify that the SU(2) and SU(4) are the preds of SU(3)
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_missing_preds_edges:%bb.0
+; CHECK:Cluster ld/st SU(3) - SU(5)
+; CHECK: Copy Pred SU(4)
+; CHECK: Copy Pred SU(2)
+; CHECK:SU(2):   %0:gpr64common = COPY $x0
+; CHECK:SU(3):   STRWui %1:gpr32, %0:gpr64common, 0
+; CHECK:SU(4):   %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
+; CHECK:SU(5):   STRWui %3:gpr32common, %0:gpr64common, 1
+define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
+entry:
+  store i32 %m, i32* %p, align 4
+  %add = add nsw i32 %n, 5
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %add, i32* %arrayidx1, align 4
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index d2d9bea66089..909c05925e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -40,6 +40,7 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
 ; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
 ; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
@@ -55,212 +56,214 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 20, v0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
 ; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 28, v0
 ; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 36, v0
 ; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 44, v0
 ; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
+; GCN-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 52, v0
 ; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 60, v0
 ; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x44, v0
 ; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
+; GCN-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x4c, v0
 ; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x54, v0
 ; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
+; GCN-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x5c, v0
 ; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x64, v0
 ; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x6c, v0
 ; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x74, v0
 ; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
+; GCN-NEXT:    buffer_store_dword v32, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x7c, v0
 ; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v34, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x84, v0
 ; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
-; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
+; GCN-NEXT:    buffer_store_dword v36, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x8c, v0
 ; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v38, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x94, v0
 ; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v40, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x9c, v0
 ; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v42, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0xa4, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v10, v17
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    v_mov_b32_e32 v11, v18
+; GCN-NEXT:    v_add_u32_e32 v2, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xb4, v0
 ; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
+; GCN-NEXT:    buffer_store_dword v48, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xbc, v0
 ; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v50, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v2, 0xc4, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xcc, v0
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 4, v0
+; GCN-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xd8, v0
-; GCN-NEXT:    v_add_u32_e32 v5, 0xdc, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 8, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 12, v0
+; GCN-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_add_u32_e32 v2, 0xd4, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xd8, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xdc, v0
 ; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v53, v4, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v54, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v52, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v53, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v54, v4, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v0
-; GCN-NEXT:    v_add_u32_e32 v3, 0xe4, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xe8, v0
-; GCN-NEXT:    v_add_u32_e32 v5, 0xec, v0
-; GCN-NEXT:    v_add_u32_e32 v6, 0xf0, v0
-; GCN-NEXT:    v_add_u32_e32 v7, 0xf4, v0
-; GCN-NEXT:    v_add_u32_e32 v8, 0xf8, v0
-; GCN-NEXT:    v_add_u32_e32 v9, 0xfc, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xe8, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xf0, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xf4, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xfc, v0
 ; GCN-NEXT:    buffer_store_dword v55, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v56, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v57, v4, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v58, v5, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v59, v6, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v60, v7, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v61, v8, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v62, v9, s[0:3], 0 offen
-; GCN-NEXT:    v_and_b32_e32 v1, 63, v2
+; GCN-NEXT:    buffer_store_dword v56, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v57, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 63, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
@@ -323,6 +326,7 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
 ; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
 ; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
@@ -338,215 +342,217 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 20, v0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
 ; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 28, v0
 ; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 36, v0
 ; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 44, v0
 ; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
-; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
+; GCN-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 52, v0
 ; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 60, v0
 ; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x44, v0
 ; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
+; GCN-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x4c, v0
 ; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x54, v0
 ; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
+; GCN-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x5c, v0
 ; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x64, v0
 ; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x6c, v0
 ; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x74, v0
 ; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
+; GCN-NEXT:    buffer_store_dword v32, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x7c, v0
 ; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v34, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x84, v0
 ; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
-; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
+; GCN-NEXT:    buffer_store_dword v36, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x8c, v0
 ; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v38, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x94, v0
 ; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v40, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x9c, v0
 ; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v42, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0xa4, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v10, v17
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    v_mov_b32_e32 v11, v18
+; GCN-NEXT:    v_add_u32_e32 v2, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xb4, v0
 ; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
+; GCN-NEXT:    buffer_store_dword v48, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xbc, v0
 ; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v50, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v2, 0xc4, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xcc, v0
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 4, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 8, v0
 ; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 12, v0
+; GCN-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
-; GCN-NEXT:    v_and_b32_e32 v1, 63, v1
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd0, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xd4, v0
-; GCN-NEXT:    v_add_u32_e32 v5, 0xd8, v0
-; GCN-NEXT:    v_add_u32_e32 v6, 0xdc, v0
-; GCN-NEXT:    buffer_store_dword v51, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v52, v4, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v53, v5, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v54, v6, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xe0, v0
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xd8, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xdc, v0
+; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v53, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v54, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe0, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xe8, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xf0, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 0xf4, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v9, 0xfc, v0
+; GCN-NEXT:    buffer_store_dword v55, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v56, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v57, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v9, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 1, v10
+; GCN-NEXT:    v_and_b32_e32 v1, 63, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GCN-NEXT:    v_add_u32_e32 v4, 0xe4, v0
-; GCN-NEXT:    v_add_u32_e32 v5, 0xe8, v0
-; GCN-NEXT:    v_add_u32_e32 v6, 0xec, v0
-; GCN-NEXT:    v_add_u32_e32 v7, 0xf0, v0
-; GCN-NEXT:    v_add_u32_e32 v8, 0xf4, v0
-; GCN-NEXT:    v_add_u32_e32 v9, 0xf8, v0
-; GCN-NEXT:    v_add_u32_e32 v10, 0xfc, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v55, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v56, v4, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v57, v5, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v58, v6, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v59, v7, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v60, v8, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v61, v9, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v62, v10, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -563,7 +569,7 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v10
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_waitcnt vmcnt(15)
@@ -579,22 +585,9 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v32i64_varidx:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v15, v0
 ; GCN-NEXT:    s_add_u32 s4, s32, 0x3fc0
-; GCN-NEXT:    s_mov_b32 s5, 0
 ; GCN-NEXT:    s_mov_b32 s6, s33
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xffffc000
-; GCN-NEXT:    s_movk_i32 s4, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v16, v1
-; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
-; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
-; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
-; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
-; GCN-NEXT:    s_movk_i32 s4, 0xc0
-; GCN-NEXT:    v_mov_b32_e32 v12, s5
-; GCN-NEXT:    v_mov_b32_e32 v11, s4
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
@@ -610,8 +603,41 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v15, v0
+; GCN-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v[15:16], off
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_add_co_u32_e32 v31, vcc, 64, v15
+; GCN-NEXT:    s_movk_i32 s4, 0x80
+; GCN-NEXT:    v_addc_co_u32_e32 v32, vcc, 0, v16, vcc
+; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
+; GCN-NEXT:    v_add_co_u32_e32 v48, vcc, v15, v11
+; GCN-NEXT:    v_addc_co_u32_e32 v49, vcc, v16, v12, vcc
+; GCN-NEXT:    s_movk_i32 s4, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v12, s5
+; GCN-NEXT:    v_mov_b32_e32 v11, s4
 ; GCN-NEXT:    v_add_co_u32_e32 v59, vcc, v15, v11
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[15:16], off
 ; GCN-NEXT:    global_load_dwordx4 v[7:10], v[15:16], off offset:16
 ; GCN-NEXT:    v_addc_co_u32_e32 v60, vcc, v16, v12, vcc
 ; GCN-NEXT:    global_load_dwordx4 v[11:14], v[15:16], off offset:32
@@ -623,215 +649,198 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    global_load_dwordx4 v[35:38], v[48:49], off
 ; GCN-NEXT:    global_load_dwordx4 v[39:42], v[48:49], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[43:46], v[48:49], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[3:6], v[59:60], off
 ; GCN-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
 ; GCN-NEXT:    v_add_u32_e32 v0, 0x100, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 16, v0
-; GCN-NEXT:    s_add_u32 s32, s32, 0x10000
-; GCN-NEXT:    s_sub_u32 s32, s32, 0x10000
+; GCN-NEXT:    v_add_u32_e32 v2, 24, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[48:49], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[59:60], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
 ; GCN-NEXT:    global_load_dwordx4 v[51:54], v[59:60], off offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[55:58], v[59:60], off offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[59:62], v[59:60], off offset:48
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 24, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 20, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 28, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 36, v0
-; GCN-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 20, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 44, v0
+; GCN-NEXT:    v_add_u32_e32 v7, 28, v0
+; GCN-NEXT:    v_add_u32_e32 v9, 36, v0
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 32, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 40, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 32, v0
 ; GCN-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 48, v0
-; GCN-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 48, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 56, v0
+; GCN-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v4, 52, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 60, v0
+; GCN-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 52, v0
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 60, v0
-; GCN-NEXT:    buffer_store_dword v18, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x44, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x4c, v0
 ; GCN-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x4c, v0
-; GCN-NEXT:    buffer_store_dword v22, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 64, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x48, v0
 ; GCN-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x48, v0
-; GCN-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v23, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x58, v0
-; GCN-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v24, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x58, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x60, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0x54, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0x5c, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0x64, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0x6c, v0
+; GCN-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v24, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v26, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v30, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x68, v0
 ; GCN-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x68, v0
-; GCN-NEXT:    buffer_store_dword v29, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x70, v0
+; GCN-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v7, 0x74, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0x7c, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x78, v0
 ; GCN-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x78, v0
-; GCN-NEXT:    buffer_store_dword v33, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v34, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v33, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v32, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v34, v8, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x84, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x8c, v0
 ; GCN-NEXT:    buffer_store_dword v36, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v38, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v38, v2, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x80, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0x88, v0
 ; GCN-NEXT:    buffer_store_dword v35, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x88, v0
-; GCN-NEXT:    buffer_store_dword v37, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x90, v0
+; GCN-NEXT:    buffer_store_dword v37, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v39, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x98, v0
-; GCN-NEXT:    buffer_store_dword v41, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x94, v0
-; GCN-NEXT:    buffer_store_dword v40, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0x9c, v0
-; GCN-NEXT:    buffer_store_dword v42, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa4, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v8, v15
-; GCN-NEXT:    v_mov_b32_e32 v9, v16
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v11, v18
-; GCN-NEXT:    v_add_u32_e32 v1, 0xac, v0
-; GCN-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0x98, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xa0, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v10, v17
-; GCN-NEXT:    v_add_u32_e32 v1, 0xa8, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v3, 0x94, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0x9c, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0xa4, v0
+; GCN-NEXT:    v_add_u32_e32 v6, 0xac, v0
+; GCN-NEXT:    buffer_store_dword v41, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v40, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v42, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v44, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v46, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xa8, v0
+; GCN-NEXT:    buffer_store_dword v43, v1, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xb0, v0
+; GCN-NEXT:    buffer_store_dword v45, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v7, 0xb4, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xbc, v0
+; GCN-NEXT:    v_add_u32_e32 v2, 0xb8, v0
 ; GCN-NEXT:    buffer_store_dword v47, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb8, v0
-; GCN-NEXT:    buffer_store_dword v49, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xb4, v0
-; GCN-NEXT:    buffer_store_dword v48, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v50, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_store_dword v49, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v48, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v50, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v2, 0xc8, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xc0, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc8, v0
-; GCN-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xc4, v0
-; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 0xcc, v0
-; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 8, v0
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 4, v0
-; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v1, 12, v0
-; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_add_u32_e32 v7, 0xec, v0
+; GCN-NEXT:    v_add_u32_e32 v8, 0xf4, v0
+; GCN-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-NEXT:    v_add_u32_e32 v2, 0xc4, v0
+; GCN-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-NEXT:    v_add_u32_e32 v3, 0xcc, v0
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT:    v_add_u32_e32 v2, 8, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0xd0, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xd8, v0
-; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v53, v4, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v3, 0xd4, v0
-; GCN-NEXT:    v_add_u32_e32 v6, 0xe0, v0
-; GCN-NEXT:    v_add_u32_e32 v1, 0xf4, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xf8, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 12, v0
+; GCN-NEXT:    v_add_u32_e32 v4, 0xd4, v0
 ; GCN-NEXT:    v_add_u32_e32 v5, 0xdc, v0
-; GCN-NEXT:    v_add_u32_e32 v7, 0xe4, v0
-; GCN-NEXT:    v_add_u32_e32 v8, 0xe8, v0
-; GCN-NEXT:    v_add_u32_e32 v10, 0xf0, v0
-; GCN-NEXT:    buffer_store_dword v55, v6, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v57, v8, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v59, v10, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v61, v4, s[0:3], 0 offen
-; GCN-NEXT:    v_add_u32_e32 v9, 0xec, v0
-; GCN-NEXT:    v_add_u32_e32 v4, 0xfc, v0
-; GCN-NEXT:    buffer_store_dword v52, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v6, 0xe4, v0
+; GCN-NEXT:    v_add_u32_e32 v9, 0xfc, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 4, v0
+; GCN-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:256
+; GCN-NEXT:    v_add_u32_e32 v2, 0xd8, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xe0, v0
+; GCN-NEXT:    buffer_store_dword v51, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v1, 0xe8, v0
+; GCN-NEXT:    buffer_store_dword v53, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v55, v3, s[0:3], 0 offen
+; GCN-NEXT:    v_add_u32_e32 v2, 0xf0, v0
+; GCN-NEXT:    v_add_u32_e32 v3, 0xf8, v0
+; GCN-NEXT:    buffer_store_dword v57, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v59, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v61, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v52, v4, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v54, v5, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v56, v7, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v58, v9, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v60, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v62, v4, s[0:3], 0 offen
-; GCN-NEXT:    v_and_b32_e32 v1, 31, v2
+; GCN-NEXT:    buffer_store_dword v56, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v58, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v60, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v62, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v1, 4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index e9d62068c8a8..abb422ae7363 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v6, s15
 ; GCN-NEXT:    v_mov_b32_e32 v8, s16
 ; GCN-NEXT:    v_mov_b32_e32 v10, s17
+; GCN-NEXT:    v_mov_b32_e32 v12, s18
+; GCN-NEXT:    v_mov_b32_e32 v14, s19
 ; GCN-NEXT:    s_movk_i32 s5, 0x60
 ; GCN-NEXT:    v_add_u32_e32 v2, 8, v0
 ; GCN-NEXT:    v_add_u32_e32 v3, 12, v0
 ; GCN-NEXT:    v_add_u32_e32 v7, 16, v0
 ; GCN-NEXT:    v_add_u32_e32 v9, 20, v0
 ; GCN-NEXT:    v_add_u32_e32 v11, 24, v0
-; GCN-NEXT:    v_mov_b32_e32 v12, s18
 ; GCN-NEXT:    v_add_u32_e32 v13, 28, v0
-; GCN-NEXT:    v_mov_b32_e32 v14, s19
 ; GCN-NEXT:    v_add_u32_e32 v15, 32, v0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s20
 ; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
@@ -71,7 +71,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_store_dword v28, v27, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v30, v29, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v32, v31, s[0:3], 0 offen
-; GCN-NEXT:    s_movk_i32 s10, 0x70
+; GCN-NEXT:    s_movk_i32 s13, 0x70
 ; GCN-NEXT:    v_add_u32_e32 v35, 0x48, v0
 ; GCN-NEXT:    v_mov_b32_e32 v36, s70
 ; GCN-NEXT:    v_add_u32_e32 v37, 0x4c, v0
@@ -96,19 +96,19 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_add_u32_e32 v26, 0x64, v0
 ; GCN-NEXT:    v_mov_b32_e32 v14, s77
 ; GCN-NEXT:    v_mov_b32_e32 v4, s81
-; GCN-NEXT:    s_movk_i32 s11, 0x90
-; GCN-NEXT:    s_movk_i32 s13, 0xa0
+; GCN-NEXT:    s_movk_i32 s14, 0x90
+; GCN-NEXT:    s_movk_i32 s15, 0xa0
 ; GCN-NEXT:    v_add_u32_e32 v28, 0x68, v0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s78
 ; GCN-NEXT:    v_add_u32_e32 v30, 0x6c, v0
 ; GCN-NEXT:    v_mov_b32_e32 v18, s79
+; GCN-NEXT:    v_add_u32_e32 v32, s13, v0
 ; GCN-NEXT:    v_mov_b32_e32 v20, s80
-; GCN-NEXT:    v_mov_b32_e32 v5, s82
-; GCN-NEXT:    v_mov_b32_e32 v6, s83
-; GCN-NEXT:    v_add_u32_e32 v32, s10, v0
 ; GCN-NEXT:    v_add_u32_e32 v34, 0x74, v0
 ; GCN-NEXT:    v_add_u32_e32 v36, 0x78, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s82
 ; GCN-NEXT:    v_add_u32_e32 v43, 0x7c, v0
+; GCN-NEXT:    v_mov_b32_e32 v6, s83
 ; GCN-NEXT:    v_add_u32_e32 v44, 0x80, v0
 ; GCN-NEXT:    v_mov_b32_e32 v8, s52
 ; GCN-NEXT:    buffer_store_dword v14, v26, s[0:3], 0 offen
@@ -121,12 +121,12 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_store_dword v8, v44, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v45, 0x84, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s53
-; GCN-NEXT:    s_movk_i32 s14, 0xb0
+; GCN-NEXT:    s_movk_i32 s16, 0xb0
 ; GCN-NEXT:    v_add_u32_e32 v46, 0x88, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s54
 ; GCN-NEXT:    v_add_u32_e32 v47, 0x8c, v0
 ; GCN-NEXT:    v_mov_b32_e32 v6, s55
-; GCN-NEXT:    v_add_u32_e32 v48, s11, v0
+; GCN-NEXT:    v_add_u32_e32 v48, s14, v0
 ; GCN-NEXT:    v_mov_b32_e32 v8, s56
 ; GCN-NEXT:    v_add_u32_e32 v49, 0x94, v0
 ; GCN-NEXT:    v_mov_b32_e32 v10, s57
@@ -134,7 +134,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v12, s58
 ; GCN-NEXT:    v_add_u32_e32 v51, 0x9c, v0
 ; GCN-NEXT:    v_mov_b32_e32 v14, s59
-; GCN-NEXT:    v_add_u32_e32 v52, s13, v0
+; GCN-NEXT:    v_add_u32_e32 v52, s15, v0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s60
 ; GCN-NEXT:    buffer_store_dword v4, v45, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v5, v46, s[0:3], 0 offen
@@ -146,13 +146,13 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_store_dword v16, v52, s[0:3], 0 offen
 ; GCN-NEXT:    v_add_u32_e32 v53, 0xa4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s61
-; GCN-NEXT:    s_movk_i32 s15, 0xd0
-; GCN-NEXT:    s_movk_i32 s16, 0xe0
+; GCN-NEXT:    s_movk_i32 s17, 0xd0
+; GCN-NEXT:    s_movk_i32 s18, 0xe0
 ; GCN-NEXT:    v_add_u32_e32 v54, 0xa8, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s62
 ; GCN-NEXT:    v_add_u32_e32 v55, 0xac, v0
 ; GCN-NEXT:    v_mov_b32_e32 v6, s63
-; GCN-NEXT:    v_add_u32_e32 v56, s14, v0
+; GCN-NEXT:    v_add_u32_e32 v56, s16, v0
 ; GCN-NEXT:    v_mov_b32_e32 v8, s64
 ; GCN-NEXT:    v_add_u32_e32 v57, 0xb4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v10, s65
@@ -173,12 +173,12 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_add_u32_e32 v61, 0xc4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v4, s37
 ; GCN-NEXT:    s_and_b32 s7, s7, 63
-; GCN-NEXT:    s_movk_i32 s17, 0xf0
+; GCN-NEXT:    s_movk_i32 s19, 0xf0
 ; GCN-NEXT:    v_add_u32_e32 v62, 0xc8, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s38
 ; GCN-NEXT:    v_add_u32_e32 v63, 0xcc, v0
 ; GCN-NEXT:    v_mov_b32_e32 v6, s39
-; GCN-NEXT:    v_add_u32_e32 v64, s15, v0
+; GCN-NEXT:    v_add_u32_e32 v64, s17, v0
 ; GCN-NEXT:    v_mov_b32_e32 v8, s40
 ; GCN-NEXT:    v_add_u32_e32 v65, 0xd4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v10, s41
@@ -186,7 +186,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v12, s42
 ; GCN-NEXT:    v_add_u32_e32 v67, 0xdc, v0
 ; GCN-NEXT:    v_mov_b32_e32 v14, s43
-; GCN-NEXT:    v_add_u32_e32 v68, s16, v0
+; GCN-NEXT:    v_add_u32_e32 v68, s18, v0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s44
 ; GCN-NEXT:    buffer_store_dword v4, v61, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v5, v62, s[0:3], 0 offen
@@ -202,7 +202,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v5, s46
 ; GCN-NEXT:    v_add_u32_e32 v71, 0xec, v0
 ; GCN-NEXT:    v_mov_b32_e32 v6, s47
-; GCN-NEXT:    v_add_u32_e32 v72, s17, v0
+; GCN-NEXT:    v_add_u32_e32 v72, s19, v0
 ; GCN-NEXT:    v_mov_b32_e32 v8, s48
 ; GCN-NEXT:    v_add_u32_e32 v73, 0xf4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v10, s49
@@ -217,9 +217,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v4, s12
 ; GCN-NEXT:    s_lshl_b32 s7, s7, 2
 ; GCN-NEXT:    v_add_u32_e32 v75, 0xfc, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s51
+; GCN-NEXT:    v_mov_b32_e32 v14, s51
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:256
-; GCN-NEXT:    buffer_store_dword v5, v75, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v14, v75, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN-NEXT:    v_add_u32_e32 v0, s7, v0
 ; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
@@ -289,78 +289,78 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:256
 ; GCN-NEXT:    s_add_u32 s6, s8, 16
 ; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v65, s9
 ; GCN-NEXT:    v_mov_b32_e32 v67, s7
 ; GCN-NEXT:    v_mov_b32_e32 v66, s6
 ; GCN-NEXT:    s_add_u32 s6, s8, 32
-; GCN-NEXT:    v_mov_b32_e32 v64, s8
 ; GCN-NEXT:    s_addc_u32 s7, s9, 0
+; GCN-NEXT:    v_mov_b32_e32 v65, s9
+; GCN-NEXT:    s_add_u32 s10, s8, 48
+; GCN-NEXT:    v_mov_b32_e32 v64, s8
+; GCN-NEXT:    s_addc_u32 s11, s9, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v[64:65], v[0:3], off
 ; GCN-NEXT:    global_store_dwordx4 v[66:67], v[4:7], off
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, 48
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN-NEXT:    s_add_u32 s6, s8, 64
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    s_addc_u32 s7, s9, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    s_add_u32 s10, s8, s4
+; GCN-NEXT:    s_addc_u32 s11, s9, 0
+; GCN-NEXT:    s_add_u32 s4, s8, s5
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    s_addc_u32 s5, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_add_u32 s6, s8, s4
-; GCN-NEXT:    s_addc_u32 s7, s9, 0
-; GCN-NEXT:    s_add_u32 s4, s8, s5
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    s_add_u32 s6, s8, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[16:19], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
+; GCN-NEXT:    s_addc_u32 s7, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s10
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_add_u32 s4, s8, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    s_addc_u32 s5, s9, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    s_add_u32 s6, s8, s14
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[24:27], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
+; GCN-NEXT:    s_addc_u32 s7, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s11
+; GCN-NEXT:    s_add_u32 s4, s8, s15
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s13
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    s_add_u32 s6, s8, s16
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[32:35], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[36:39], off
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
+; GCN-NEXT:    s_addc_u32 s7, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s14
-; GCN-NEXT:    s_addc_u32 s5, s9, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_add_u32 s4, s8, 0xc0
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[40:43], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[44:47], off
 ; GCN-NEXT:    s_addc_u32 s5, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s15
+; GCN-NEXT:    s_add_u32 s4, s8, s17
 ; GCN-NEXT:    s_addc_u32 s5, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s16
+; GCN-NEXT:    s_add_u32 s4, s8, s18
 ; GCN-NEXT:    global_store_dwordx4 v[0:1], v[48:51], off
 ; GCN-NEXT:    global_store_dwordx4 v[2:3], v[52:55], off
 ; GCN-NEXT:    s_addc_u32 s5, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_add_u32 s4, s8, s17
+; GCN-NEXT:    s_add_u32 s4, s8, s19
 ; GCN-NEXT:    s_addc_u32 s5, s9, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 2986cb48a86e..931bb12f29eb 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,13 +744,13 @@ entry:
 
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
 
 ; GCN: s_getpc_b64
 
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index bfd9a9bf4e96..2dc47ca94aa9 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,11 +624,10 @@ define void @too_many_args_use_workitem_id_x_byval(
 
 
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
-
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 
 ; FIXME: Why this reload?
@@ -670,9 +669,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
 
 ; FIXED-ABI-NOT: v31
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index bacbfcb8f500..fa34e42f22b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1364,11 +1364,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
-; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
 ; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
 ; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
+; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index d54058eec30c..1908015f4770 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -312,6 +312,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
 
 ; GCN: flat_store_dwordx4
 
@@ -325,7 +326,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
 
 ; VI: v_cvt_f32_f16_e32
 ; VI: v_cvt_f32_f16_sdwa

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 8b7557d5deb8..193f99731331 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -160,16 +160,16 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_mov_b32 s34, s32
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-
-; GCN: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-
-; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000
 
+; GCN: v_mov_b32_e32 v33, 0
+
+; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
+
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index a4d08ec980f4..70c5655fe811 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -7,11 +7,11 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT:        v_lshrrev_b32_e32 v3, 16, v2
 ; CIVI-NEXT:        s_mov_b32 m0, -1
-; CIVI-NEXT:        ds_write_b8 v0, v3 offset:6
-; CIVI-NEXT:        ds_write_b16 v0, v2 offset:4
 ; CIVI-NEXT:        ds_write_b32 v0, v1
+; CIVI-NEXT:        v_lshrrev_b32_e32 v1, 16, v2
+; CIVI-NEXT:        ds_write_b16 v0, v2 offset:4
+; CIVI-NEXT:        ds_write_b8 v0, v1 offset:6
 ; CIVI-NEXT:        s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:        s_setpc_b64 s[30:31]
 ;


        


More information about the llvm-commits mailing list