[llvm] 69153d6 - AMDGPU: Use GlobalPriority for largest register tuples

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 15 08:45:13 PDT 2022


Author: Matt Arsenault
Date: 2022-09-15T11:45:02-04:00
New Revision: 69153d6c0a3f9110bc455b1cca28a5a71e2ac933

URL: https://github.com/llvm/llvm-project/commit/69153d6c0a3f9110bc455b1cca28a5a71e2ac933
DIFF: https://github.com/llvm/llvm-project/commit/69153d6c0a3f9110bc455b1cca28a5a71e2ac933.diff

LOG: AMDGPU: Use GlobalPriority for largest register tuples

Only do this for 16- and 32-register tuples, although we might want to
extend it to 8-register tuples.

It's incredibly expensive to spill these, and doing so majorly
interferes with the ability to allocate anything else in the function.

The lit tests show mostly sizeable improvements, with a handful of tiny
regressions involving large vectors.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index e5095e971add1..99c8f86e9081d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -792,8 +792,11 @@ defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
 defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+
+let GlobalPriority = true in {
 defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
 defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
+}
 
 def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                                  (add VGPR_32, LDS_DIRECT_CLASS)> {
@@ -833,8 +836,11 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
 defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
 defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
 defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
+
+let GlobalPriority = true in {
 defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
 defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+}
 
 multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
   let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
@@ -854,8 +860,11 @@ defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
 defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
 defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
 defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+
+let GlobalPriority = true in {
 defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
 defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
+}
 
 } // End GeneratePressureSet = 0
 
@@ -910,8 +919,11 @@ defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>;
 defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>;
 defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>;
 defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>;
+
+let GlobalPriority = true in {
 defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>;
 defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>;
+}
 
 //===----------------------------------------------------------------------===//
 //  Register operands

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 36bb7ca7af0e9..cfa56f05fac7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -11,237 +11,166 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v6, v2
+; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
-; GCN-NEXT:    v_and_b32_e32 v0, 63, v2
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
 ; GCN-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_add_u32_e32 v1, 0x100, v1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_and_b32_e32 v0, 63, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v15
-; GCN-NEXT:    v_mov_b32_e32 v13, v16
-; GCN-NEXT:    v_mov_b32_e32 v14, v17
-; GCN-NEXT:    v_mov_b32_e32 v15, v18
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v7
-; GCN-NEXT:    v_mov_b32_e32 v5, v8
-; GCN-NEXT:    v_mov_b32_e32 v6, v9
-; GCN-NEXT:    v_mov_b32_e32 v7, v10
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
+; GCN-NEXT:    v_mov_b32_e32 v16, v20
+; GCN-NEXT:    v_mov_b32_e32 v17, v21
+; GCN-NEXT:    v_mov_b32_e32 v18, v22
+; GCN-NEXT:    v_mov_b32_e32 v19, v23
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -257,241 +186,170 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[19:22], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[23:26], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[27:30], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[31:34], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[35:38], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[39:42], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[43:46], v[0:1], off offset:160
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:176
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v6, v2
+; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
+; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[47:50], v[0:1], off offset:192
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[51:54], v[0:1], off offset:208
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[55:58], v[0:1], off offset:224
-; GCN-NEXT:    global_load_dwordx4 v[59:62], v[0:1], off offset:240
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:428
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload
-; GCN-NEXT:    v_bfe_u32 v0, v2, 1, 6
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
+; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
+; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
+; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_bfe_u32 v0, v6, 1, 6
+; GCN-NEXT:    v_lshrrev_b32_e64 v5, 6, s33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v2
+; GCN-NEXT:    v_add_u32_e32 v5, 0x100, v5
+; GCN-NEXT:    v_add_u32_e32 v0, v5, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v6
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v12, v15
-; GCN-NEXT:    v_mov_b32_e32 v13, v16
-; GCN-NEXT:    v_mov_b32_e32 v14, v17
-; GCN-NEXT:    v_mov_b32_e32 v15, v18
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:432
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:436
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:440
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v7
-; GCN-NEXT:    v_mov_b32_e32 v5, v8
-; GCN-NEXT:    v_mov_b32_e32 v6, v9
-; GCN-NEXT:    v_mov_b32_e32 v7, v10
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:492
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:496
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:500
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:504
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:508
-; GCN-NEXT:    v_lshrrev_b32_e64 v7, 6, s33
-; GCN-NEXT:    v_add_u32_e32 v7, 0x100, v7
-; GCN-NEXT:    v_add_u32_e32 v0, v7, v0
+; GCN-NEXT:    v_mov_b32_e32 v16, v20
+; GCN-NEXT:    v_mov_b32_e32 v17, v21
+; GCN-NEXT:    v_mov_b32_e32 v18, v22
+; GCN-NEXT:    v_mov_b32_e32 v19, v23
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(15)
+; GCN-NEXT:    s_waitcnt vmcnt(16)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -507,150 +365,111 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:32
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:48
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:64
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:80
-; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:96
-; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:112
-; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:128
-; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:144
-; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:160
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v6, v2
+; GCN-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:48
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GCN-NEXT:    global_load_dwordx4 v[44:47], v[0:1], off offset:80
+; GCN-NEXT:    global_load_dwordx4 v[40:43], v[0:1], off offset:96
+; GCN-NEXT:    global_load_dwordx4 v[60:63], v[0:1], off offset:112
+; GCN-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:128
+; GCN-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:144
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:160
 ; GCN-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:192
+; GCN-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:208
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x10000
 ; GCN-NEXT:    s_add_i32 s32, s32, 0xffff0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:208
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT:    global_load_dwordx4 v[56:59], v[0:1], off offset:224
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:224
 ; GCN-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT:    v_and_b32_e32 v0, 31, v2
-; GCN-NEXT:    v_lshrrev_b32_e64 v2, 6, s33
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT:    v_add_u32_e32 v1, v2, v0
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:256
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:260
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:264
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:268
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:272
-; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:276
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:280
-; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:284
-; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:288
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:292
-; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:296
-; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:300
-; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:304
-; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:308
-; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:312
-; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:316
-; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:320
-; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:324
-; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:328
-; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:332
-; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:336
-; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:340
-; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:344
-; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:348
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:352
-; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:356
-; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:360
-; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:364
-; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:368
-; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:372
-; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:376
-; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:380
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:384
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:388
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:392
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:396
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:400
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:404
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:408
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:412
-; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:416
-; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:420
-; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:424
-; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:428
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:256
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:260
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:264
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:268
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:272
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:276
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:280
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:284
+; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:288
+; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:292
+; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:296
+; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:300
+; GCN-NEXT:    buffer_store_dword v48, off, s[0:3], s33 offset:304
+; GCN-NEXT:    buffer_store_dword v49, off, s[0:3], s33 offset:308
+; GCN-NEXT:    buffer_store_dword v50, off, s[0:3], s33 offset:312
+; GCN-NEXT:    buffer_store_dword v51, off, s[0:3], s33 offset:316
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s33 offset:320
+; GCN-NEXT:    buffer_store_dword v21, off, s[0:3], s33 offset:324
+; GCN-NEXT:    buffer_store_dword v22, off, s[0:3], s33 offset:328
+; GCN-NEXT:    buffer_store_dword v23, off, s[0:3], s33 offset:332
+; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:336
+; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:340
+; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:344
+; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:348
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:352
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:356
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:360
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:364
+; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:368
+; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:372
+; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:376
+; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:380
+; GCN-NEXT:    buffer_store_dword v36, off, s[0:3], s33 offset:384
+; GCN-NEXT:    buffer_store_dword v37, off, s[0:3], s33 offset:388
+; GCN-NEXT:    buffer_store_dword v38, off, s[0:3], s33 offset:392
+; GCN-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:396
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:400
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:404
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s33 offset:408
+; GCN-NEXT:    buffer_store_dword v35, off, s[0:3], s33 offset:412
+; GCN-NEXT:    buffer_store_dword v28, off, s[0:3], s33 offset:416
+; GCN-NEXT:    buffer_store_dword v29, off, s[0:3], s33 offset:420
+; GCN-NEXT:    buffer_store_dword v30, off, s[0:3], s33 offset:424
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:428
 ; GCN-NEXT:    buffer_store_dword v52, off, s[0:3], s33 offset:432
 ; GCN-NEXT:    buffer_store_dword v53, off, s[0:3], s33 offset:436
 ; GCN-NEXT:    buffer_store_dword v54, off, s[0:3], s33 offset:440
 ; GCN-NEXT:    buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, v16
-; GCN-NEXT:    v_mov_b32_e32 v4, v17
-; GCN-NEXT:    v_mov_b32_e32 v5, v18
-; GCN-NEXT:    v_mov_b32_e32 v6, v19
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:448
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:452
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:456
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:460
+; GCN-NEXT:    buffer_store_dword v24, off, s[0:3], s33 offset:448
+; GCN-NEXT:    buffer_store_dword v25, off, s[0:3], s33 offset:452
+; GCN-NEXT:    buffer_store_dword v26, off, s[0:3], s33 offset:456
+; GCN-NEXT:    buffer_store_dword v27, off, s[0:3], s33 offset:460
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
@@ -667,37 +486,46 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT:    v_and_b32_e32 v0, 31, v6
+; GCN-NEXT:    v_lshrrev_b32_e64 v5, 6, s33
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT:    v_add_u32_e32 v5, 0x100, v5
+; GCN-NEXT:    v_add_u32_e32 v1, v5, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, v20
-; GCN-NEXT:    v_mov_b32_e32 v5, v21
-; GCN-NEXT:    v_mov_b32_e32 v6, v22
-; GCN-NEXT:    v_mov_b32_e32 v7, v23
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:464
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:468
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:472
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:476
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:480
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:484
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:488
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:492
+; GCN-NEXT:    v_mov_b32_e32 v16, v20
+; GCN-NEXT:    v_mov_b32_e32 v17, v21
+; GCN-NEXT:    v_mov_b32_e32 v18, v22
+; GCN-NEXT:    v_mov_b32_e32 v19, v23
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s33 offset:464
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 offset:468
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s33 offset:472
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s33 offset:476
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s33 offset:480
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 offset:484
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s33 offset:488
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s33 offset:492
 ; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s33 offset:496
 ; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s33 offset:500
 ; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s33 offset:504
 ; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s33 offset:508
 ; GCN-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 2e97f4d74e934..9b0943ea753bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -339,118 +339,115 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX10-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off offset:32
-; GFX10-NEXT:    v_add_nc_u32_e32 v19, 1, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v19
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v4, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v9, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, v8, v10, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 2, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v9, v11, s4
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 2, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v8, vcc_lo
-; GFX10-NEXT:    global_load_dwordx4 v[15:18], v[0:1], off offset:48
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v5, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v12, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v13, v7, vcc_lo
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 4, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v19
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v16, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v18, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, v17, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, v18, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 7, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[11:14], v[0:1], off offset:32
-; GFX11-NEXT:    global_load_b128 v[15:18], v[0:1], off offset:48
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:32
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v20, v3, v5 :: v_dual_cndmask_b32 v21, v4, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, v12, v14, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, v13, v15, s0
+; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v21, v8, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v20, v7 :: v_dual_add_nc_u32 v19, 1, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v9 :: v_dual_cndmask_b32 v1, v1, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v19, v5, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v19
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 4, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v11 :: v_dual_cndmask_b32 v1, v1, v12
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v19
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v13 :: v_dual_cndmask_b32 v1, v1, v14
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 5, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 4, v19
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 6, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v15 :: v_dual_cndmask_b32 v1, v1, v16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 5, v19
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v17 :: v_dual_cndmask_b32 v1, v1, v18
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 6, v19
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 7, v19
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v3, v17, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v4, v18, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 7, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr
   %element = extractelement <4 x i128> %vector, i32 %idx

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index b0bb5c516c325..73dc2961df8f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -21,10 +21,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GCN-NEXT:    global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
 ; GCN-NEXT:    global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:192
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:208
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:224
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:240
 ; GCN-NEXT:    s_waitcnt vmcnt(6)
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0x3e7
 ; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
@@ -34,11 +34,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
+; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
+; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
+; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:224
 ; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[2:3]
 ; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
 ; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
@@ -48,7 +48,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
 ; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
+; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:240
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_insert_v64i32_37:
@@ -67,24 +67,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX10-NEXT:    global_load_dwordx4 v[56:59], v64, s[0:1] offset:96
 ; GFX10-NEXT:    global_load_dwordx4 v[60:63], v64, s[0:1] offset:112
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v64, s[0:1] offset:144
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v64, s[0:1] offset:160
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:160
 ; GFX10-NEXT:    global_load_dwordx4 v[12:15], v64, s[0:1] offset:176
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v64, s[0:1] offset:208
-; GFX10-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:224
-; GFX10-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:240
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v64, s[0:1] offset:192
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v64, s[0:1] offset:208
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v64, s[0:1] offset:224
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v64, s[0:1] offset:240
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0x3e7
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[0:3], s[2:3] offset:128
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[4:7], s[2:3] offset:144
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    global_store_dwordx4 v64, v[8:11], s[2:3] offset:160
+; GFX10-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:160
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[12:15], s[2:3] offset:176
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:192
+; GFX10-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:192
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    global_store_dwordx4 v64, v[20:23], s[2:3] offset:208
+; GFX10-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:208
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[32:35], s[2:3]
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[36:39], s[2:3] offset:16
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[40:43], s[2:3] offset:32
@@ -94,9 +94,9 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[56:59], s[2:3] offset:96
 ; GFX10-NEXT:    global_store_dwordx4 v64, v[60:63], s[2:3] offset:112
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    global_store_dwordx4 v64, v[24:27], s[2:3] offset:224
+; GFX10-NEXT:    global_store_dwordx4 v64, v[8:11], s[2:3] offset:224
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v64, v[28:31], s[2:3] offset:240
+; GFX10-NEXT:    global_store_dwordx4 v64, v[16:19], s[2:3] offset:240
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_insert_v64i32_37:
@@ -117,24 +117,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3e7
 ; GFX11-NEXT:    s_clause 0x6
-; GFX11-NEXT:    global_load_b128 v[0:3], v64, s[0:1] offset:128
-; GFX11-NEXT:    global_load_b128 v[8:11], v64, s[0:1] offset:160
-; GFX11-NEXT:    global_load_b128 v[12:15], v64, s[0:1] offset:176
-; GFX11-NEXT:    global_load_b128 v[16:19], v64, s[0:1] offset:192
-; GFX11-NEXT:    global_load_b128 v[20:23], v64, s[0:1] offset:208
-; GFX11-NEXT:    global_load_b128 v[24:27], v64, s[0:1] offset:224
-; GFX11-NEXT:    global_load_b128 v[28:31], v64, s[0:1] offset:240
+; GFX11-NEXT:    global_load_b128 v[28:31], v64, s[0:1] offset:128
+; GFX11-NEXT:    global_load_b128 v[24:27], v64, s[0:1] offset:160
+; GFX11-NEXT:    global_load_b128 v[20:23], v64, s[0:1] offset:176
+; GFX11-NEXT:    global_load_b128 v[0:3], v64, s[0:1] offset:192
+; GFX11-NEXT:    global_load_b128 v[16:19], v64, s[0:1] offset:208
+; GFX11-NEXT:    global_load_b128 v[8:11], v64, s[0:1] offset:224
+; GFX11-NEXT:    global_load_b128 v[12:15], v64, s[0:1] offset:240
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v64, v[0:3], s[2:3] offset:128
+; GFX11-NEXT:    global_store_b128 v64, v[28:31], s[2:3] offset:128
 ; GFX11-NEXT:    global_store_b128 v64, v[4:7], s[2:3] offset:144
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    global_store_b128 v64, v[8:11], s[2:3] offset:160
+; GFX11-NEXT:    global_store_b128 v64, v[24:27], s[2:3] offset:160
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    global_store_b128 v64, v[12:15], s[2:3] offset:176
+; GFX11-NEXT:    global_store_b128 v64, v[20:23], s[2:3] offset:176
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    s_clause 0x8
-; GFX11-NEXT:    global_store_b128 v64, v[16:19], s[2:3] offset:192
+; GFX11-NEXT:    global_store_b128 v64, v[0:3], s[2:3] offset:192
 ; GFX11-NEXT:    global_store_b128 v64, v[32:35], s[2:3]
 ; GFX11-NEXT:    global_store_b128 v64, v[36:39], s[2:3] offset:16
 ; GFX11-NEXT:    global_store_b128 v64, v[40:43], s[2:3] offset:32
@@ -144,11 +144,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 ; GFX11-NEXT:    global_store_b128 v64, v[56:59], s[2:3] offset:96
 ; GFX11-NEXT:    global_store_b128 v64, v[60:63], s[2:3] offset:112
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    global_store_b128 v64, v[20:23], s[2:3] offset:208
+; GFX11-NEXT:    global_store_b128 v64, v[16:19], s[2:3] offset:208
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    global_store_b128 v64, v[24:27], s[2:3] offset:224
+; GFX11-NEXT:    global_store_b128 v64, v[8:11], s[2:3] offset:224
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b128 v64, v[28:31], s[2:3] offset:240
+; GFX11-NEXT:    global_store_b128 v64, v[12:15], s[2:3] offset:240
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 4f8ba532db04a..41683b290d7ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -819,22 +819,22 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-NEXT:    s_mov_b32 s9, 0x40080000
 ; GPRIDX-NEXT:    s_mov_b32 s8, s18
 ; GPRIDX-NEXT:    s_mov_b64 s[6:7], 2.0
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v4, s5
-; GPRIDX-NEXT:    v_mov_b32_e32 v5, s6
-; GPRIDX-NEXT:    v_mov_b32_e32 v6, s7
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, s8
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, s9
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, s10
-; GPRIDX-NEXT:    v_mov_b32_e32 v10, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v11, s12
-; GPRIDX-NEXT:    v_mov_b32_e32 v12, s13
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, s14
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, s15
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, s16
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, s17
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, s18
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, s19
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s16
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s17
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s18
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s19
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[16:17], 0, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[4:5], 2, v2
@@ -843,29 +843,29 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 5, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 6, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[14:15], 7, v2
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[16:17]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s[16:17]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v11, v0, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v13, v0, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v15, v0, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s[14:15]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v1, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v14, v1, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v16, v16, v1, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v18, v18, v1, s[14:15]
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s[16:17]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v6, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v5, v1, s[16:17]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v7, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v8, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v10, v0, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v12, v0, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v14, v0, s[10:11]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v16, v0, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v16, v18, v0, s[14:15]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v1, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v11, v1, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v13, v1, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v15, v1, s[10:11]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v17, v1, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v17, v19, v1, s[14:15]
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[14:17], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1025,23 +1025,23 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; GPRIDX-NEXT:    s_mov_b32 s10, s12
 ; GPRIDX-NEXT:    s_mov_b32 s12, s14
 ; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, s15
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, s14
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, s13
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, s12
-; GPRIDX-NEXT:    v_mov_b32_e32 v12, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v11, s10
-; GPRIDX-NEXT:    v_mov_b32_e32 v10, s9
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, s8
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, s7
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, s6
-; GPRIDX-NEXT:    v_mov_b32_e32 v6, s5
-; GPRIDX-NEXT:    v_mov_b32_e32 v5, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v4, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, s18
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s18
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
@@ -1050,30 +1050,30 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s19
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v17, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v17, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v13, v17, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v0, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v14, v0, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v16, v16, v0, s[10:11]
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[1:4], off
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v4, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v6, v1, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v8, v1, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v10, v1, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v12, v1, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v14, v1, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v16, v1, s[10:11]
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s19
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v1, v3, v16, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v3, v5, v16, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v5, v7, v16, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v16, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v11, v16, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v13, v16, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v15, v16, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v17, v16, s[10:11]
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[5:8], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[13:16], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[12:15], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    s_endpgm
 ;
@@ -1447,22 +1447,22 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
 ; GPRIDX-NEXT:    s_mov_b32 s10, s12
 ; GPRIDX-NEXT:    s_mov_b32 s12, s14
 ; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, s15
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, s14
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, s13
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, s12
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, s11
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, s10
-; GPRIDX-NEXT:    v_mov_b32_e32 v12, s9
-; GPRIDX-NEXT:    v_mov_b32_e32 v11, s8
-; GPRIDX-NEXT:    v_mov_b32_e32 v10, s7
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, s6
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, s5
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v6, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v5, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v4, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s0
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v2
@@ -1471,29 +1471,29 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v2
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v2
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v4, v4, v1, s[12:13]
-; GPRIDX-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v9, v0, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v11, v0, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v13, v0, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v15, v0, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s[10:11]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[0:1]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[2:3]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v12, v1, s[4:5]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v14, v1, s[6:7]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v16, v16, v1, s[8:9]
-; GPRIDX-NEXT:    v_cndmask_b32_e64 v18, v18, v1, s[10:11]
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v4, v6, v0, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v3, v5, v1, s[12:13]
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v5, v7, v1, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v6, v8, v0, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v8, v10, v0, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v10, v12, v0, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v12, v14, v0, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v14, v16, v0, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v16, v18, v0, s[10:11]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v7, v9, v1, s[0:1]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v9, v11, v1, s[2:3]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v11, v13, v1, s[4:5]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v13, v15, v1, s[6:7]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v15, v17, v1, s[8:9]
+; GPRIDX-NEXT:    v_cndmask_b32_e64 v17, v19, v1, s[10:11]
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
-; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
+; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[14:17], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index cd7b5018e9bd1..1c955ee622d2c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -878,34 +878,33 @@ entry:
 define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double15_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0xa4
+; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xa4
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x114
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x104
+; GCN-NEXT:    s_load_dwordx4 s[20:23], s[0:1], 0x104
 ; GCN-NEXT:    s_load_dwordx8 s[24:31], s[0:1], 0xe4
 ; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mov_b32_e32 v28, s2
-; GCN-NEXT:    v_mov_b32_e32 v24, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0x124
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NEXT:    v_mov_b32_e32 v28, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s2, s4, 1
-; GCN-NEXT:    v_mov_b32_e32 v4, s12
-; GCN-NEXT:    v_mov_b32_e32 v5, s13
-; GCN-NEXT:    v_mov_b32_e32 v6, s14
-; GCN-NEXT:    v_mov_b32_e32 v7, s15
-; GCN-NEXT:    v_mov_b32_e32 v8, s16
-; GCN-NEXT:    v_mov_b32_e32 v9, s17
-; GCN-NEXT:    v_mov_b32_e32 v10, s18
-; GCN-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NEXT:    v_mov_b32_e32 v12, s20
-; GCN-NEXT:    v_mov_b32_e32 v13, s21
-; GCN-NEXT:    v_mov_b32_e32 v14, s22
-; GCN-NEXT:    v_mov_b32_e32 v15, s23
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
+; GCN-NEXT:    v_mov_b32_e32 v8, s12
+; GCN-NEXT:    v_mov_b32_e32 v9, s13
+; GCN-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-NEXT:    v_mov_b32_e32 v11, s15
+; GCN-NEXT:    v_mov_b32_e32 v12, s16
+; GCN-NEXT:    v_mov_b32_e32 v13, s17
+; GCN-NEXT:    v_mov_b32_e32 v14, s18
+; GCN-NEXT:    v_mov_b32_e32 v15, s19
 ; GCN-NEXT:    v_mov_b32_e32 v16, s24
 ; GCN-NEXT:    v_mov_b32_e32 v17, s25
 ; GCN-NEXT:    v_mov_b32_e32 v18, s26
@@ -914,9 +913,10 @@ define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15
 ; GCN-NEXT:    v_mov_b32_e32 v21, s29
 ; GCN-NEXT:    v_mov_b32_e32 v22, s30
 ; GCN-NEXT:    v_mov_b32_e32 v23, s31
-; GCN-NEXT:    v_mov_b32_e32 v25, s5
-; GCN-NEXT:    v_mov_b32_e32 v26, s6
-; GCN-NEXT:    v_mov_b32_e32 v27, s7
+; GCN-NEXT:    v_mov_b32_e32 v24, s20
+; GCN-NEXT:    v_mov_b32_e32 v25, s21
+; GCN-NEXT:    v_mov_b32_e32 v26, s22
+; GCN-NEXT:    v_mov_b32_e32 v27, s23
 ; GCN-NEXT:    v_mov_b32_e32 v29, s3
 ; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    v_movreld_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index c30a35996994b..6248bef24b3a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -39,13 +39,13 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
 ; GCN-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
 ; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
 ; GCN-NEXT:    v_add_u32_e32 v4, 0x6000, v1
-; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:49264
-; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:49248
-; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:49232
-; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:49216
-; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:49200
-; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:49184
-; GCN-NEXT:    ds_read_b128 a[116:119], v4 offset:57456
+; GCN-NEXT:    ds_read_b128 a[116:119], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[112:115], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[108:111], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[104:107], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[100:103], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[96:99], v1 offset:24608
+; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24592
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
@@ -60,28 +60,28 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
-; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
-; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
-; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
-; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
-; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
-; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
-; GCN-NEXT:    ds_read_b128 a[112:115], v4 offset:57440
-; GCN-NEXT:    ds_read_b128 a[108:111], v4 offset:57424
-; GCN-NEXT:    ds_read_b128 a[104:107], v4 offset:57408
-; GCN-NEXT:    ds_read_b128 a[88:91], v4 offset:57344
-; GCN-NEXT:    ds_read_b128 a[92:95], v4 offset:57360
-; GCN-NEXT:    ds_read_b128 a[96:99], v4 offset:57376
+; GCN-NEXT:    ds_read_b128 a[88:91], v1 offset:24576
+; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[28:31], v4 offset:57456
+; GCN-NEXT:    ds_read_b128 a[24:27], v4 offset:57440
+; GCN-NEXT:    ds_read_b128 a[20:23], v4 offset:57424
+; GCN-NEXT:    ds_read_b128 a[16:19], v4 offset:57408
+; GCN-NEXT:    ds_read_b128 a[0:3], v4 offset:57344
+; GCN-NEXT:    ds_read_b128 a[4:7], v4 offset:57360
+; GCN-NEXT:    ds_read_b128 a[8:11], v4 offset:57376
 ; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
 ; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
 ; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:49168
 ; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:49152
-; GCN-NEXT:    ds_read_b128 a[100:103], v4 offset:57392
+; GCN-NEXT:    ds_read_b128 a[12:15], v4 offset:57392
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
 ; GCN-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
 ; GCN-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
@@ -91,15 +91,15 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[56:87], v2, v3, a[56:87]
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:32864
-; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:32880
-; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:32832
-; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:32848
-; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:32800
-; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:32816
-; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:32768
-; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:32784
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119]
 ; GCN-NEXT:    ds_write_b128 v0, a[80:83] offset:24672
 ; GCN-NEXT:    ds_write_b128 v0, a[84:87] offset:24688
 ; GCN-NEXT:    ds_write_b128 v0, a[72:75] offset:24640
@@ -110,14 +110,14 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no
 ; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:24592
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
-; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
-; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
-; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
-; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
-; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
-; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
-; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
+; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:16400
 ; GCN-NEXT:    s_endpgm
 entry:
   call void @llvm.amdgcn.iglp.opt(i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2c67f4a8cb7df..10878248f46be 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2162,26 +2162,30 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s20, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s21, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s22, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s23, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s25, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s26, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s27, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s28, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s29, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s30, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s31, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s33, s17, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s34, s16, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s35, s19, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s18, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s18, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s0, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s21, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s23, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s25, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s27, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s29, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s31, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s34, s14, 16
+; GCN-HSA-NEXT:    s_and_b32 s35, s1, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s36, s0, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -2194,80 +2198,76 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s17, s17, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s16, s16, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s19, s19, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s18, s18, 0xffff
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s34
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s33
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s31
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s30
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32:
@@ -2548,112 +2548,112 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ;
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s20, s5, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s21, s4, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s22, s7, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s23, s6, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s24, s9, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s25, s8, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s26, s11, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s27, s10, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s28, s13, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s29, s12, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s30, s15, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s31, s14, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s33, s17, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s34, s16, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s35, s19, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s36, s18, 16
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s19, s19
-; GCN-HSA-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_sext_i32_i16 s17, s17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    s_ashr_i32 s18, s1, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s19, s0, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s22, s3, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s23, s2, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s24, s5, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s25, s4, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s26, s7, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s27, s6, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s28, s9, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s29, s8, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s30, s11, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s31, s10, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s33, s13, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s34, s12, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s35, s15, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s36, s14, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s21, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    s_sext_i32_i16 s20, s1
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
+; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s34
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s17
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s33
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
-; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32:
@@ -2853,28 +2853,28 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s3, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s2, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s39, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s41, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s43, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s44, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s45, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s47, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s49, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s51, s1, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s52, s0, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s53, s3, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s54, s2, 0xffff
@@ -2890,67 +2890,67 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s55, s37, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s36, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s57, s39, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s38, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s59, s41, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s60, s40, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s61, s43, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s62, s42, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s63, s45, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s64, s44, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s65, s47, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s66, s46, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s67, s49, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s68, s48, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s69, s51, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s70, s50, 16
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s37, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s36, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s39, s39, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s38, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s40, s40, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s43, s43, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s42, s42, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s45, s45, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s44, s44, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s47, s47, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s46, s46, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s49, s49, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s48, s48, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s51, s51, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s50, s50, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s41, s41, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s55, s17, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s56, s16, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s57, s19, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s58, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s59, s21, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s60, s20, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s61, s23, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s62, s22, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s63, s25, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s64, s24, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s65, s27, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s66, s26, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s67, s29, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s68, s28, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s69, s31, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s70, s30, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s20, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s23, s23, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s22, s22, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s25, s25, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s24, s24, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s27, s27, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s26, s26, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s29, s29, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s28, s28, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s31, s31, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s30, s30, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s21, s21, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s36
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s37
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s30
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s70
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s28
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s68
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s29
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s67
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s26
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s66
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s27
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s65
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s25
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s63
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s42
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s62
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s23
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s61
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s59
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
@@ -2958,64 +2958,64 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s57
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s56
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s55
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s49
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s48
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s47
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s46
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s45
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s44
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s26
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s43
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s42
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s41
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s40
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s54
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s53
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s52
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
@@ -3024,32 +3024,65 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s21, s0, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s22, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s23, s2, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s24, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s25, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s26, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s27, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s28, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s29, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s30, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s31, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s33, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s34, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s35, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s52, s14, 16
-; GCN-HSA-NEXT:    s_and_b32 s1, s1, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s35, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s37, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s38, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s39, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s40, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s41, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s42, s14, 16
+; GCN-HSA-NEXT:    s_and_b32 s25, s1, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s27, s0, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s29, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s31, s2, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s34, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s36, s4, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s43, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s44, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s45, s9, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s46, s8, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s47, s11, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s48, s10, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s49, s13, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s50, s12, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s51, s15, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s52, s14, 0xffff
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    s_lshr_b32 s18, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s0, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s53, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s54, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s55, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s56, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s57, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s58, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s59, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s61, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s63, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s65, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s14, 16
+; GCN-HSA-NEXT:    s_and_b32 s67, s1, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s68, s0, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s53, s7, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s54, s6, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
@@ -3058,179 +3091,145 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-HSA-NEXT:    s_lshr_b32 s18, s37, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s19, s36, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s55, s39, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s56, s38, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s57, s41, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s58, s40, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s59, s43, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s60, s42, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s61, s45, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s62, s44, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s63, s47, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s64, s46, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s65, s49, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s66, s48, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s67, s51, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s50, 16
-; GCN-HSA-NEXT:    s_and_b32 s37, s37, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s36, s36, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s39, s39, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s38, s38, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s41, s41, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s40, s40, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s43, s43, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s42, s42, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s45, s45, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s44, s44, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s47, s47, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s46, s46, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s49, s49, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s48, s48, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s51, s51, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s50, s50, 0xffff
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xe0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s61
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s59
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x80
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s57
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s63
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s41
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s68
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s53
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s14
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s67
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT:    s_add_u32 s6, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s16, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-HSA-NEXT:    s_addc_u32 s5, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s16, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT:    s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s27
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s25
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3756,216 +3755,217 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ;
 ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x0
-; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x10
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_ashr_i32 s4, s17, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s5, s16, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s17
-; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s16
-; GCN-HSA-NEXT:    s_ashr_i32 s8, s19, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s9, s18, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s19
-; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s18
-; GCN-HSA-NEXT:    s_ashr_i32 s12, s21, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s13, s20, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s21
-; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s20
-; GCN-HSA-NEXT:    s_ashr_i32 s16, s23, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s17, s22, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s18, s23
-; GCN-HSA-NEXT:    s_sext_i32_i16 s19, s22
-; GCN-HSA-NEXT:    s_ashr_i32 s20, s25, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s21, s24, 16
-; GCN-HSA-NEXT:    s_sext_i32_i16 s22, s25
-; GCN-HSA-NEXT:    s_sext_i32_i16 s23, s24
-; GCN-HSA-NEXT:    s_ashr_i32 s24, s27, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s25, s26, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s33, s29, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s34, s28, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s35, s31, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s52, s30, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s53, s37, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s54, s36, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s55, s39, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s56, s38, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s57, s41, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s58, s40, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s59, s43, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s60, s42, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s61, s45, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s62, s44, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s63, s47, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s64, s46, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s65, s49, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s66, s48, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s67, s51, 16
-; GCN-HSA-NEXT:    s_ashr_i32 s68, s50, 16
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s47
-; GCN-HSA-NEXT:    s_sext_i32_i16 s46, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s46
+; GCN-HSA-NEXT:    s_ashr_i32 s20, s1, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s21, s0, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s22, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s23, s0
+; GCN-HSA-NEXT:    s_ashr_i32 s24, s3, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s25, s2, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s26, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s27, s2
+; GCN-HSA-NEXT:    s_ashr_i32 s28, s5, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s29, s4, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s30, s5
+; GCN-HSA-NEXT:    s_sext_i32_i16 s31, s4
+; GCN-HSA-NEXT:    s_ashr_i32 s33, s7, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s34, s6, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s35, s7
+; GCN-HSA-NEXT:    s_sext_i32_i16 s36, s6
+; GCN-HSA-NEXT:    s_ashr_i32 s37, s9, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s38, s8, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s39, s9
+; GCN-HSA-NEXT:    s_sext_i32_i16 s40, s8
+; GCN-HSA-NEXT:    s_ashr_i32 s41, s11, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s42, s10, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s43, s11
+; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s10
+; GCN-HSA-NEXT:    s_ashr_i32 s45, s13, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s46, s12, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s47, s13
+; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s12
+; GCN-HSA-NEXT:    s_ashr_i32 s49, s15, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s50, s14, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s15
+; GCN-HSA-NEXT:    s_sext_i32_i16 s52, s14
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    s_ashr_i32 s18, s1, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s19, s0, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s55, s3, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s56, s2, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s57, s5, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s58, s4, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s59, s7, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s60, s6, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s61, s9, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s62, s8, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s63, s11, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s64, s10, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s65, s13, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s66, s12, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s67, s15, 16
+; GCN-HSA-NEXT:    s_ashr_i32 s68, s14, 16
+; GCN-HSA-NEXT:    s_sext_i32_i16 s54, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
+; GCN-HSA-NEXT:    s_sext_i32_i16 s53, s1
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT:    s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s63
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[8:11]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s45, s45
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s44, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s61
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[12:15]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s51, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s50, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s43, s43
-; GCN-HSA-NEXT:    s_sext_i32_i16 s42, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    s_sext_i32_i16 s15, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_sext_i32_i16 s14, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s7, s7
+; GCN-HSA-NEXT:    s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s59
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[27:28], v[16:19]
-; GCN-HSA-NEXT:    s_sext_i32_i16 s49, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    s_sext_i32_i16 s48, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s36, s36
-; GCN-HSA-NEXT:    s_sext_i32_i16 s39, s39
-; GCN-HSA-NEXT:    s_sext_i32_i16 s38, s38
-; GCN-HSA-NEXT:    s_sext_i32_i16 s41, s41
-; GCN-HSA-NEXT:    s_sext_i32_i16 s40, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s48
+; GCN-HSA-NEXT:    s_sext_i32_i16 s13, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-HSA-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-HSA-NEXT:    s_sext_i32_i16 s2, s2
+; GCN-HSA-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    s_sext_i32_i16 s29, s29
-; GCN-HSA-NEXT:    s_sext_i32_i16 s28, s28
-; GCN-HSA-NEXT:    s_sext_i32_i16 s31, s31
-; GCN-HSA-NEXT:    s_sext_i32_i16 s30, s30
-; GCN-HSA-NEXT:    s_sext_i32_i16 s37, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[4:7]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s57
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s54
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s54
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s18
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s45
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[20:23]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    s_sext_i32_i16 s27, s27
-; GCN-HSA-NEXT:    s_sext_i32_i16 s26, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s31
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s27
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s19
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s12
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
@@ -6050,260 +6050,262 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s3, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s20, s5, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s7, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s9, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s11, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s13, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s16, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s35, s0, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s36, s2, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s37, s1, 0xffff
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s38, s3, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s37
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s26
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s27
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s28
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s29
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s30
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s31
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s33
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s36
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s3, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s20, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s21, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s22, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s23, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s25, s17, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s26, s19, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s27, s18, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s28, s16, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s29, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s30, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s31, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s33, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s34, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s2, s4, 16
-; GCN-HSA-NEXT:    s_and_b32 s35, s4, 0xffff
+; GCN-HSA-NEXT:    s_lshr_b32 s19, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s21, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s23, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s25, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s27, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s29, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s31, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s34, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s18, s0, 16
+; GCN-HSA-NEXT:    s_and_b32 s35, s0, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s10, s10, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s14, s14, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s16, s16, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s18, s18, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s36, s5, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s36, s1, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-HSA-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s13, s13, 0xffff
 ; GCN-HSA-NEXT:    s_and_b32 s15, s15, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s17, s17, 0xffff
-; GCN-HSA-NEXT:    s_and_b32 s19, s19, 0xffff
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s25
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s21
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 48
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xa0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x80
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s33
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 3c26845ea48a3..28539a49a965f 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s
 
 ; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC
@@ -29,8 +29,15 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
-; GREEDY: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
-; GREEDY: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
+; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
+; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
+
+; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
+; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
+
+; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
+; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
+
 ; FAST:   v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
 ; FAST:   v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
 ; GCN:    v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 57e5342480929..a615a5e6a7248 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -792,23 +792,17 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ;
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s15, 0
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[56:59], s[8:9], 0x0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s14, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s10, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s11, 3
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s6, 4
 ; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v44, off
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s7, 5
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[56:57]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s61, 0
 ; GLOBALNESS0-NEXT:    s_mov_b32 s60, s61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 6
 ; GLOBALNESS0-NEXT:    s_mov_b32 s62, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
@@ -840,7 +834,6 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, s60
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 7
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, s61
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, s62
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, s63
@@ -873,123 +866,123 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, s90
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, s91
 ; GLOBALNESS0-NEXT:    s_movk_i32 s60, 0x80
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 11
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 12
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 13
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 14
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 15
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 16
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 18
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 19
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 20
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 21
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 22
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 23
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 24
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 25
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 26
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 27
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s80, 28
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s81, 29
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s82, 30
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s83, 31
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s84, 32
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s85, 33
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s86, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s80, 20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s81, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s82, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s83, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s84, 24
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s85, 25
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s86, 26
+; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s87, 35
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s88, 36
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s89, 37
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s87, 27
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s88, 28
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s89, 29
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0x40994400
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s90, 38
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s90, 30
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[8:9]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s91, 39
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s91, 31
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[44:45]
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s8, 40
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s8, 32
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s9, 41
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s9, 33
 ; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s58, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 42
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 43
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 35
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[38:39], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[46:47], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 44
+; GLOBALNESS0-NEXT:    s_xor_b64 s[50:51], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 45
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, wobble@gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12
-; GLOBALNESS0-NEXT:    s_xor_b64 s[100:101], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[52:53], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 46
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 47
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 48
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 49
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 36
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 37
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 50
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 51
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 38
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 39
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 40
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 41
+; GLOBALNESS0-NEXT:    s_mov_b32 s57, 0x3ff00000
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 42
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 43
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 44
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 45
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 46
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 47
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 48
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 49
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 50
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 51
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 53
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s78, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s79, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s80, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s81, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s82, 4
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[100:101], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s83, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s84, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s85, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s86, 8
+; GLOBALNESS0-NEXT:    s_mov_b32 s44, s16
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, s15
+; GLOBALNESS0-NEXT:    s_mov_b32 s42, s14
+; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 53
-; GLOBALNESS0-NEXT:    s_mov_b32 s41, 0x3ff00000
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s40, 54
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s50, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s51, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s52, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s53, 3
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s54, 4
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s55, 5
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 7
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 11
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 12
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s41, 55
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 13
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s42, 56
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 14
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s43, 57
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 15
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 58
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 16
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[36:37], s[6:7], 0x0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 59
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 60
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 18
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 19
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 62
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 20
-; GLOBALNESS0-NEXT:    s_mov_b32 s33, s16
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 63
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 21
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[48:49], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s87, 9
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 52
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 53
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 41
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow6
@@ -999,7 +992,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow19
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, v31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v42, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v42, 10
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, v30
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, v29
@@ -1032,61 +1025,58 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, v2
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, v1
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, v0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v42, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v42, 11
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
 ; GLOBALNESS0-NEXT:    ; Child Loop BB1_17 Depth 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s48, 40
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    buffer_store_dword v44, off, s[0:3], 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s49, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s12, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s33
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 31
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
 ; GLOBALNESS0-NEXT:    ; kill: killed $sgpr4_sgpr5
@@ -1115,37 +1105,41 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow18
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s8, 22
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s9, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s8, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s9, 11
 ; GLOBALNESS0-NEXT:  .LBB1_10: ; %Flow16
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 5
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[64:65]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 31
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s69, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
@@ -1196,68 +1190,74 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 3
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[0:1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 31
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 54
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 55
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 56
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 57
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 58
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 59
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 60
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 61
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 62
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 63
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v42, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v42, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v42, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v42, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v42, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v42, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v42, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v42, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 42
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 43
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 44
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 45
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 46
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 47
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 48
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 49
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 50
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 51
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 52
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 53
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 54
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 55
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 56
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 57
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 58
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 59
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 60
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 61
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 62
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 63
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v42, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v42, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v42, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v42, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v42, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v42, 5
 ; GLOBALNESS0-NEXT:    s_mov_b32 s60, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s62, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
@@ -1269,63 +1269,53 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s41, s61
-; GLOBALNESS0-NEXT:    s_mov_b64 s[96:97], s[54:55]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v42, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v42, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v42, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v42, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v42, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v42, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v42, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v42, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v42, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v42, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v42, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v42, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v42, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v42, 21
 ; GLOBALNESS0-NEXT:    s_mov_b32 s72, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s74, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
 ; GLOBALNESS0-NEXT:    s_mov_b32 s76, s57
 ; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s40, 54
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s50, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s51, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s52, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s53, 3
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s54, 4
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s55, 5
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 7
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 11
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 12
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s41, 55
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 13
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s42, 56
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 14
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s43, 57
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 15
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 58
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 16
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 59
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 60
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 18
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s57, s61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 42
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 43
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 44
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 45
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 46
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 47
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 48
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 49
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 50
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 51
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 53
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v42, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v42, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v42, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v42, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s78, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s79, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s80, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s81, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s82, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s83, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s84, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s85, 7
+; GLOBALNESS0-NEXT:    s_mov_b64 s[92:93], s[54:55]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 62
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 20
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[54:55], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s86, 8
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 63
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s87, 9
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1]
@@ -1341,17 +1331,15 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[34:35], s[6:7]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[88:89], s[54:55]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 46
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s6, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 47
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s7, 25
-; GLOBALNESS0-NEXT:    s_mov_b32 s99, s59
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 37
+; GLOBALNESS0-NEXT:    s_mov_b32 s91, s59
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
 ; GLOBALNESS0-NEXT:  ; %bb.13: ; %bb39.i
@@ -1362,81 +1350,69 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 40
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 42
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 32
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 41
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 43
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 33
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_17
 ; GLOBALNESS0-NEXT:  .LBB1_15: ; %Flow7
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb63.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[100:101]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
 ; GLOBALNESS0-NEXT:  .LBB1_17: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[38:39]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[46:47]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
 ; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb46.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 44
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 45
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[50:51]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
 ; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb50.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[60:61]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
 ; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb3.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[60:61]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
 ; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb6.i.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
 ; GLOBALNESS0-NEXT:  .LBB1_22: ; %spam.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 48
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 49
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[48:49]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
 ; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb55.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s64, s48, 40
-; GLOBALNESS0-NEXT:    v_readlane_b32 s40, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s42, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s44, v41, 2
-; GLOBALNESS0-NEXT:    s_addc_u32 s65, s49, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s41, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s43, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s45, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s46, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s47, v41, 0
+; GLOBALNESS0-NEXT:    s_add_u32 s64, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s65, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[42:43]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s46
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s47
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s33
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[42:43]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s46
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s47
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s33
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_15
 ; GLOBALNESS0-NEXT:  ; %bb.24: ; %bb62.i
@@ -1447,250 +1423,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_15
 ; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow14
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s58, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s59, v41, 11
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[48:49]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[56:57]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 17
-; GLOBALNESS0-NEXT:    s_mov_b32 s56, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s57, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s58, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s59, s49
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 19
-; GLOBALNESS0-NEXT:    s_mov_b32 s60, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s61, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s62, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s63, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s65, s49
-; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b32 s66, s49
-; GLOBALNESS0-NEXT:    s_mov_b32 s67, s49
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[64:65]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 31
-; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[66:67]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 39
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 11
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 12
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 13
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 14
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 15
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 16
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 18
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 19
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 20
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 21
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 22
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 23
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 24
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 25
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 26
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 27
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 28
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 29
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 30
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 31
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 32
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 33
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 34
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 35
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 36
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 37
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 38
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 39
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[64:65]
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
-; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
-; GLOBALNESS0-NEXT:    s_mov_b32 s60, s64
+; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v41, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v41, 1
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
-; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[64:65]
-; GLOBALNESS0-NEXT:    s_mov_b32 s61, s45
-; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[48:49]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
-; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[50:51]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
-; GLOBALNESS0-NEXT:    s_mov_b32 s58, s61
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[60:61]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
-; GLOBALNESS0-NEXT:    s_mov_b32 s59, s37
-; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[58:59]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
-; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[54:55]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[50:51]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[48:49]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[46:47]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[44:45]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
@@ -1707,94 +1443,63 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s65, s57
+; GLOBALNESS0-NEXT:    v_readlane_b32 s59, v41, 3
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s65
-; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[48:49]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[42:43], s[50:51]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[52:53]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[54:55]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[58:59]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[60:61]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[62:63]
-; GLOBALNESS0-NEXT:    s_mov_b32 s56, s65
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
-; GLOBALNESS0-NEXT:    s_mov_b32 s57, s61
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[40:41], s[40:41] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[42:43], s[42:43] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[44:45], s[44:45] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[46:47], s[46:47] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[48:49], s[48:49] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[50:51], s[50:51] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[52:53], s[52:53] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[54:55], s[54:55] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[56:57], s[56:57] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[58:59], s[58:59] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[60:61], s[60:61] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[62:63], s[62:63] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[64:65], s[64:65] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[66:67], s[66:67] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[70:71], s[70:71] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[6:7]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v42, 24
-; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GLOBALNESS0-NEXT:    s_mov_b32 s59, s99
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v42, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s57
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s57
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s58, v41, 2
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_mov_b32 s59, s91
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 7
 ; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[96:97]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[88:89]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[92:93]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
 ; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 50
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 51
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 39
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
@@ -1814,40 +1519,32 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_32
 ; GLOBALNESS0-NEXT:  ; %bb.31: ; %bb7.i.i
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s48, 40
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s49, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s12, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s33
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
-; GLOBALNESS0-NEXT:    s_mov_b32 s34, s33
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:    s_mov_b32 s33, s34
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
 ; GLOBALNESS0-NEXT:  .LBB1_32: ; %Flow
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_34
 ; GLOBALNESS0-NEXT:  ; %bb.33: ; %bb11.i.i
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s48, 40
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s49, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s12, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s33
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4


        


More information about the llvm-commits mailing list